Index: projects/clang900-import/contrib/compiler-rt/lib/asan/asan_rtl.cc
===================================================================
--- projects/clang900-import/contrib/compiler-rt/lib/asan/asan_rtl.cc (revision 351721)
+++ projects/clang900-import/contrib/compiler-rt/lib/asan/asan_rtl.cc (revision 351722)
@@ -1,626 +1,626 @@
//===-- asan_rtl.cc -------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of AddressSanitizer, an address sanity checker.
//
// Main file of the ASan run-time library.
//===----------------------------------------------------------------------===//
#include "asan_activation.h"
#include "asan_allocator.h"
#include "asan_interceptors.h"
#include "asan_interface_internal.h"
#include "asan_internal.h"
#include "asan_mapping.h"
#include "asan_poisoning.h"
#include "asan_report.h"
#include "asan_stack.h"
#include "asan_stats.h"
#include "asan_suppressions.h"
#include "asan_thread.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_flags.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_symbolizer.h"
#include "lsan/lsan_common.h"
#include "ubsan/ubsan_init.h"
#include "ubsan/ubsan_platform.h"
uptr __asan_shadow_memory_dynamic_address; // Global interface symbol.
int __asan_option_detect_stack_use_after_return; // Global interface symbol.
uptr *__asan_test_only_reported_buggy_pointer; // Used only for testing asan.
namespace __asan {
uptr AsanMappingProfile[kAsanMappingProfileSize];
static void AsanDie() {
static atomic_uint32_t num_calls;
if (atomic_fetch_add(&num_calls, 1, memory_order_relaxed) != 0) {
// Don't die twice - run a busy loop.
while (1) { }
}
if (common_flags()->print_module_map >= 1) PrintModuleMap();
if (flags()->sleep_before_dying) {
Report("Sleeping for %d second(s)\n", flags()->sleep_before_dying);
SleepForSeconds(flags()->sleep_before_dying);
}
if (flags()->unmap_shadow_on_exit) {
if (kMidMemBeg) {
UnmapOrDie((void*)kLowShadowBeg, kMidMemBeg - kLowShadowBeg);
UnmapOrDie((void*)kMidMemEnd, kHighShadowEnd - kMidMemEnd);
} else {
if (kHighShadowEnd)
UnmapOrDie((void*)kLowShadowBeg, kHighShadowEnd - kLowShadowBeg);
}
}
}
static void AsanCheckFailed(const char *file, int line, const char *cond,
u64 v1, u64 v2) {
Report("AddressSanitizer CHECK failed: %s:%d \"%s\" (0x%zx, 0x%zx)\n", file,
line, cond, (uptr)v1, (uptr)v2);
// Print a stack trace the first time we come here. Otherwise, we probably
// failed a CHECK during symbolization.
static atomic_uint32_t num_calls;
if (atomic_fetch_add(&num_calls, 1, memory_order_relaxed) == 0) {
PRINT_CURRENT_STACK_CHECK();
}
Die();
}
// -------------------------- Globals --------------------- {{{1
int asan_inited;
bool asan_init_is_running;
#if !ASAN_FIXED_MAPPING
uptr kHighMemEnd, kMidMemBeg, kMidMemEnd;
#endif
// -------------------------- Misc ---------------- {{{1
void ShowStatsAndAbort() {
__asan_print_accumulated_stats();
Die();
}
// --------------- LowLevelAllocateCallback ---------- {{{1
static void OnLowLevelAllocate(uptr ptr, uptr size) {
PoisonShadow(ptr, size, kAsanInternalHeapMagic);
}
// -------------------------- Run-time entry ------------------- {{{1
// exported functions
#define ASAN_REPORT_ERROR(type, is_write, size) \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_report_ ## type ## size(uptr addr) { \
GET_CALLER_PC_BP_SP; \
ReportGenericError(pc, bp, sp, addr, is_write, size, 0, true); \
} \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_report_exp_ ## type ## size(uptr addr, u32 exp) { \
GET_CALLER_PC_BP_SP; \
ReportGenericError(pc, bp, sp, addr, is_write, size, exp, true); \
} \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_report_ ## type ## size ## _noabort(uptr addr) { \
GET_CALLER_PC_BP_SP; \
ReportGenericError(pc, bp, sp, addr, is_write, size, 0, false); \
} \
ASAN_REPORT_ERROR(load, false, 1)
ASAN_REPORT_ERROR(load, false, 2)
ASAN_REPORT_ERROR(load, false, 4)
ASAN_REPORT_ERROR(load, false, 8)
ASAN_REPORT_ERROR(load, false, 16)
ASAN_REPORT_ERROR(store, true, 1)
ASAN_REPORT_ERROR(store, true, 2)
ASAN_REPORT_ERROR(store, true, 4)
ASAN_REPORT_ERROR(store, true, 8)
ASAN_REPORT_ERROR(store, true, 16)
#define ASAN_REPORT_ERROR_N(type, is_write) \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_report_ ## type ## _n(uptr addr, uptr size) { \
GET_CALLER_PC_BP_SP; \
ReportGenericError(pc, bp, sp, addr, is_write, size, 0, true); \
} \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_report_exp_ ## type ## _n(uptr addr, uptr size, u32 exp) { \
GET_CALLER_PC_BP_SP; \
ReportGenericError(pc, bp, sp, addr, is_write, size, exp, true); \
} \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_report_ ## type ## _n_noabort(uptr addr, uptr size) { \
GET_CALLER_PC_BP_SP; \
ReportGenericError(pc, bp, sp, addr, is_write, size, 0, false); \
} \
ASAN_REPORT_ERROR_N(load, false)
ASAN_REPORT_ERROR_N(store, true)
#define ASAN_MEMORY_ACCESS_CALLBACK_BODY(type, is_write, size, exp_arg, fatal) \
if (SANITIZER_MYRIAD2 && !AddrIsInMem(addr) && !AddrIsInShadow(addr)) \
return; \
uptr sp = MEM_TO_SHADOW(addr); \
uptr s = size <= SHADOW_GRANULARITY ? *reinterpret_cast<u8 *>(sp) \
: *reinterpret_cast<u16 *>(sp); \
if (UNLIKELY(s)) { \
if (UNLIKELY(size >= SHADOW_GRANULARITY || \
((s8)((addr & (SHADOW_GRANULARITY - 1)) + size - 1)) >= \
(s8)s)) { \
if (__asan_test_only_reported_buggy_pointer) { \
*__asan_test_only_reported_buggy_pointer = addr; \
} else { \
GET_CALLER_PC_BP_SP; \
ReportGenericError(pc, bp, sp, addr, is_write, size, exp_arg, \
fatal); \
} \
} \
}
#define ASAN_MEMORY_ACCESS_CALLBACK(type, is_write, size) \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_##type##size(uptr addr) { \
ASAN_MEMORY_ACCESS_CALLBACK_BODY(type, is_write, size, 0, true) \
} \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_exp_##type##size(uptr addr, u32 exp) { \
ASAN_MEMORY_ACCESS_CALLBACK_BODY(type, is_write, size, exp, true) \
} \
extern "C" NOINLINE INTERFACE_ATTRIBUTE \
void __asan_##type##size ## _noabort(uptr addr) { \
ASAN_MEMORY_ACCESS_CALLBACK_BODY(type, is_write, size, 0, false) \
} \
ASAN_MEMORY_ACCESS_CALLBACK(load, false, 1)
ASAN_MEMORY_ACCESS_CALLBACK(load, false, 2)
ASAN_MEMORY_ACCESS_CALLBACK(load, false, 4)
ASAN_MEMORY_ACCESS_CALLBACK(load, false, 8)
ASAN_MEMORY_ACCESS_CALLBACK(load, false, 16)
ASAN_MEMORY_ACCESS_CALLBACK(store, true, 1)
ASAN_MEMORY_ACCESS_CALLBACK(store, true, 2)
ASAN_MEMORY_ACCESS_CALLBACK(store, true, 4)
ASAN_MEMORY_ACCESS_CALLBACK(store, true, 8)
ASAN_MEMORY_ACCESS_CALLBACK(store, true, 16)
extern "C"
NOINLINE INTERFACE_ATTRIBUTE
void __asan_loadN(uptr addr, uptr size) {
if (__asan_region_is_poisoned(addr, size)) {
GET_CALLER_PC_BP_SP;
ReportGenericError(pc, bp, sp, addr, false, size, 0, true);
}
}
extern "C"
NOINLINE INTERFACE_ATTRIBUTE
void __asan_exp_loadN(uptr addr, uptr size, u32 exp) {
if (__asan_region_is_poisoned(addr, size)) {
GET_CALLER_PC_BP_SP;
ReportGenericError(pc, bp, sp, addr, false, size, exp, true);
}
}
extern "C"
NOINLINE INTERFACE_ATTRIBUTE
void __asan_loadN_noabort(uptr addr, uptr size) {
if (__asan_region_is_poisoned(addr, size)) {
GET_CALLER_PC_BP_SP;
ReportGenericError(pc, bp, sp, addr, false, size, 0, false);
}
}
extern "C"
NOINLINE INTERFACE_ATTRIBUTE
void __asan_storeN(uptr addr, uptr size) {
if (__asan_region_is_poisoned(addr, size)) {
GET_CALLER_PC_BP_SP;
ReportGenericError(pc, bp, sp, addr, true, size, 0, true);
}
}
extern "C"
NOINLINE INTERFACE_ATTRIBUTE
void __asan_exp_storeN(uptr addr, uptr size, u32 exp) {
if (__asan_region_is_poisoned(addr, size)) {
GET_CALLER_PC_BP_SP;
ReportGenericError(pc, bp, sp, addr, true, size, exp, true);
}
}
extern "C"
NOINLINE INTERFACE_ATTRIBUTE
void __asan_storeN_noabort(uptr addr, uptr size) {
if (__asan_region_is_poisoned(addr, size)) {
GET_CALLER_PC_BP_SP;
ReportGenericError(pc, bp, sp, addr, true, size, 0, false);
}
}
// Force the linker to keep the symbols for various ASan interface functions.
// We want to keep these in the executable in order to let the instrumented
// dynamic libraries access the symbols even if they are not used by the
// executable itself. This should help if the build system is removing dead
// code at link time.
static NOINLINE void force_interface_symbols() {
volatile int fake_condition = 0; // prevent dead condition elimination.
// __asan_report_* functions are noreturn, so we need a switch to prevent
// the compiler from removing any of them.
// clang-format off
switch (fake_condition) {
case 1: __asan_report_load1(0); break;
case 2: __asan_report_load2(0); break;
case 3: __asan_report_load4(0); break;
case 4: __asan_report_load8(0); break;
case 5: __asan_report_load16(0); break;
case 6: __asan_report_load_n(0, 0); break;
case 7: __asan_report_store1(0); break;
case 8: __asan_report_store2(0); break;
case 9: __asan_report_store4(0); break;
case 10: __asan_report_store8(0); break;
case 11: __asan_report_store16(0); break;
case 12: __asan_report_store_n(0, 0); break;
case 13: __asan_report_exp_load1(0, 0); break;
case 14: __asan_report_exp_load2(0, 0); break;
case 15: __asan_report_exp_load4(0, 0); break;
case 16: __asan_report_exp_load8(0, 0); break;
case 17: __asan_report_exp_load16(0, 0); break;
case 18: __asan_report_exp_load_n(0, 0, 0); break;
case 19: __asan_report_exp_store1(0, 0); break;
case 20: __asan_report_exp_store2(0, 0); break;
case 21: __asan_report_exp_store4(0, 0); break;
case 22: __asan_report_exp_store8(0, 0); break;
case 23: __asan_report_exp_store16(0, 0); break;
case 24: __asan_report_exp_store_n(0, 0, 0); break;
case 25: __asan_register_globals(nullptr, 0); break;
case 26: __asan_unregister_globals(nullptr, 0); break;
case 27: __asan_set_death_callback(nullptr); break;
case 28: __asan_set_error_report_callback(nullptr); break;
case 29: __asan_handle_no_return(); break;
case 30: __asan_address_is_poisoned(nullptr); break;
case 31: __asan_poison_memory_region(nullptr, 0); break;
case 32: __asan_unpoison_memory_region(nullptr, 0); break;
case 34: __asan_before_dynamic_init(nullptr); break;
case 35: __asan_after_dynamic_init(); break;
case 36: __asan_poison_stack_memory(0, 0); break;
case 37: __asan_unpoison_stack_memory(0, 0); break;
case 38: __asan_region_is_poisoned(0, 0); break;
case 39: __asan_describe_address(0); break;
case 40: __asan_set_shadow_00(0, 0); break;
case 41: __asan_set_shadow_f1(0, 0); break;
case 42: __asan_set_shadow_f2(0, 0); break;
case 43: __asan_set_shadow_f3(0, 0); break;
case 44: __asan_set_shadow_f5(0, 0); break;
case 45: __asan_set_shadow_f8(0, 0); break;
}
// clang-format on
}
static void asan_atexit() {
Printf("AddressSanitizer exit stats:\n");
__asan_print_accumulated_stats();
// Print AsanMappingProfile.
for (uptr i = 0; i < kAsanMappingProfileSize; i++) {
if (AsanMappingProfile[i] == 0) continue;
Printf("asan_mapping.h:%zd -- %zd\n", i, AsanMappingProfile[i]);
}
}
static void InitializeHighMemEnd() {
#if !SANITIZER_MYRIAD2
#if !ASAN_FIXED_MAPPING
kHighMemEnd = GetMaxUserVirtualAddress();
// Increase kHighMemEnd to make sure it's properly
// aligned together with kHighMemBeg:
kHighMemEnd |= SHADOW_GRANULARITY * GetMmapGranularity() - 1;
#endif // !ASAN_FIXED_MAPPING
CHECK_EQ((kHighMemBeg % GetMmapGranularity()), 0);
#endif // !SANITIZER_MYRIAD2
}
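(Aside on the bit trick in InitializeHighMemEnd above: OR-ing kHighMemEnd with SHADOW_GRANULARITY * GetMmapGranularity() - 1 sets every low bit, i.e. it rounds the end address up to "next aligned boundary minus one", so the high memory region covers whole mapping granules. A minimal standalone sketch with assumed example values, 4 KiB pages and an 8-byte shadow granularity; all names below are illustrative, not the runtime's.)

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t kGranularity = 8;          // stands in for SHADOW_GRANULARITY
  const uint64_t kPageSize = 4096;          // stands in for GetMmapGranularity()
  uint64_t high_mem_end = 0x7ffffffff000;   // example max user-space address
  // Same expression as in InitializeHighMemEnd: set every bit below the
  // granule*page alignment, yielding "aligned boundary - 1".
  high_mem_end |= kGranularity * kPageSize - 1;
  std::printf("0x%llx\n", (unsigned long long)high_mem_end);  // prints 0x7fffffffffff
  return 0;
}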
void PrintAddressSpaceLayout() {
if (kHighMemBeg) {
Printf("|| `[%p, %p]` || HighMem ||\n",
(void*)kHighMemBeg, (void*)kHighMemEnd);
Printf("|| `[%p, %p]` || HighShadow ||\n",
(void*)kHighShadowBeg, (void*)kHighShadowEnd);
}
if (kMidMemBeg) {
Printf("|| `[%p, %p]` || ShadowGap3 ||\n",
(void*)kShadowGap3Beg, (void*)kShadowGap3End);
Printf("|| `[%p, %p]` || MidMem ||\n",
(void*)kMidMemBeg, (void*)kMidMemEnd);
Printf("|| `[%p, %p]` || ShadowGap2 ||\n",
(void*)kShadowGap2Beg, (void*)kShadowGap2End);
Printf("|| `[%p, %p]` || MidShadow ||\n",
(void*)kMidShadowBeg, (void*)kMidShadowEnd);
}
Printf("|| `[%p, %p]` || ShadowGap ||\n",
(void*)kShadowGapBeg, (void*)kShadowGapEnd);
if (kLowShadowBeg) {
Printf("|| `[%p, %p]` || LowShadow ||\n",
(void*)kLowShadowBeg, (void*)kLowShadowEnd);
Printf("|| `[%p, %p]` || LowMem ||\n",
(void*)kLowMemBeg, (void*)kLowMemEnd);
}
Printf("MemToShadow(shadow): %p %p",
(void*)MEM_TO_SHADOW(kLowShadowBeg),
(void*)MEM_TO_SHADOW(kLowShadowEnd));
if (kHighMemBeg) {
Printf(" %p %p",
(void*)MEM_TO_SHADOW(kHighShadowBeg),
(void*)MEM_TO_SHADOW(kHighShadowEnd));
}
if (kMidMemBeg) {
Printf(" %p %p",
(void*)MEM_TO_SHADOW(kMidShadowBeg),
(void*)MEM_TO_SHADOW(kMidShadowEnd));
}
Printf("\n");
Printf("redzone=%zu\n", (uptr)flags()->redzone);
Printf("max_redzone=%zu\n", (uptr)flags()->max_redzone);
Printf("quarantine_size_mb=%zuM\n", (uptr)flags()->quarantine_size_mb);
Printf("thread_local_quarantine_size_kb=%zuK\n",
(uptr)flags()->thread_local_quarantine_size_kb);
Printf("malloc_context_size=%zu\n",
(uptr)common_flags()->malloc_context_size);
Printf("SHADOW_SCALE: %d\n", (int)SHADOW_SCALE);
Printf("SHADOW_GRANULARITY: %d\n", (int)SHADOW_GRANULARITY);
Printf("SHADOW_OFFSET: 0x%zx\n", (uptr)SHADOW_OFFSET);
CHECK(SHADOW_SCALE >= 3 && SHADOW_SCALE <= 7);
if (kMidMemBeg)
CHECK(kMidShadowBeg > kLowShadowEnd &&
kMidMemBeg > kMidShadowEnd &&
kHighShadowBeg > kMidMemEnd);
}
#if defined(__thumb__) && defined(__linux__)
#define START_BACKGROUND_THREAD_IN_ASAN_INTERNAL
#endif
#ifndef START_BACKGROUND_THREAD_IN_ASAN_INTERNAL
static bool UNUSED __local_asan_dyninit = [] {
MaybeStartBackgroudThread();
SetSoftRssLimitExceededCallback(AsanSoftRssLimitExceededCallback);
return false;
}();
#endif
static void AsanInitInternal() {
if (LIKELY(asan_inited)) return;
SanitizerToolName = "AddressSanitizer";
CHECK(!asan_init_is_running && "ASan init calls itself!");
asan_init_is_running = true;
CacheBinaryName();
- CheckASLR();
// Initialize flags. This must be done early, because most of the
// initialization steps look at flags().
InitializeFlags();
// Stop performing init at this point if we are being loaded via
// dlopen() and the platform supports it.
if (SANITIZER_SUPPORTS_INIT_FOR_DLOPEN && UNLIKELY(HandleDlopenInit())) {
asan_init_is_running = false;
VReport(1, "AddressSanitizer init is being performed for dlopen().\n");
return;
}
AsanCheckIncompatibleRT();
AsanCheckDynamicRTPrereqs();
AvoidCVE_2016_2143();
SetCanPoisonMemory(flags()->poison_heap);
SetMallocContextSize(common_flags()->malloc_context_size);
InitializePlatformExceptionHandlers();
InitializeHighMemEnd();
// Make sure we are not statically linked.
AsanDoesNotSupportStaticLinkage();
// Install tool-specific callbacks in sanitizer_common.
AddDieCallback(AsanDie);
SetCheckFailedCallback(AsanCheckFailed);
SetPrintfAndReportCallback(AppendToErrorMessageBuffer);
__sanitizer_set_report_path(common_flags()->log_path);
__asan_option_detect_stack_use_after_return =
flags()->detect_stack_use_after_return;
__sanitizer::InitializePlatformEarly();
// Re-exec ourselves if we need to set additional env or command line args.
MaybeReexec();
// Setup internal allocator callback.
SetLowLevelAllocateMinAlignment(SHADOW_GRANULARITY);
SetLowLevelAllocateCallback(OnLowLevelAllocate);
InitializeAsanInterceptors();
+ CheckASLR();
// Enable system log ("adb logcat") on Android.
// Doing this before interceptors are initialized crashes in:
// AsanInitInternal -> android_log_write -> __interceptor_strcmp
AndroidLogInit();
ReplaceSystemMalloc();
DisableCoreDumperIfNecessary();
InitializeShadowMemory();
AsanTSDInit(PlatformTSDDtor);
InstallDeadlySignalHandlers(AsanOnDeadlySignal);
AllocatorOptions allocator_options;
allocator_options.SetFrom(flags(), common_flags());
InitializeAllocator(allocator_options);
#ifdef START_BACKGROUND_THREAD_IN_ASAN_INTERNAL
MaybeStartBackgroudThread();
SetSoftRssLimitExceededCallback(AsanSoftRssLimitExceededCallback);
#endif
// On Linux, AsanThread::ThreadStart() calls malloc(), which is why asan_inited
// should be set to 1 prior to initializing the threads.
asan_inited = 1;
asan_init_is_running = false;
if (flags()->atexit)
Atexit(asan_atexit);
InitializeCoverage(common_flags()->coverage, common_flags()->coverage_dir);
// Now that ASan runtime is (mostly) initialized, deactivate it if
// necessary, so that it can be re-activated when requested.
if (flags()->start_deactivated)
AsanDeactivate();
// interceptors
InitTlsSize();
// Create main thread.
AsanThread *main_thread = CreateMainThread();
CHECK_EQ(0, main_thread->tid());
force_interface_symbols(); // no-op.
SanitizerInitializeUnwinder();
if (CAN_SANITIZE_LEAKS) {
__lsan::InitCommonLsan();
if (common_flags()->detect_leaks && common_flags()->leak_check_at_exit) {
if (flags()->halt_on_error)
Atexit(__lsan::DoLeakCheck);
else
Atexit(__lsan::DoRecoverableLeakCheckVoid);
}
}
#if CAN_SANITIZE_UB
__ubsan::InitAsPlugin();
#endif
InitializeSuppressions();
if (CAN_SANITIZE_LEAKS) {
// LateInitialize() calls dlsym, which can allocate an error string buffer
// in the TLS. Let's ignore the allocation to avoid reporting a leak.
__lsan::ScopedInterceptorDisabler disabler;
Symbolizer::LateInitialize();
} else {
Symbolizer::LateInitialize();
}
VReport(1, "AddressSanitizer Init done\n");
if (flags()->sleep_after_init) {
Report("Sleeping for %d second(s)\n", flags()->sleep_after_init);
SleepForSeconds(flags()->sleep_after_init);
}
}
// Initialize as requested from some part of ASan runtime library (interceptors,
// allocator, etc).
void AsanInitFromRtl() {
AsanInitInternal();
}
#if ASAN_DYNAMIC
// Initialize runtime in case it's LD_PRELOAD-ed into unsanitized executable
// (and thus normal initializers from .preinit_array or modules haven't run).
class AsanInitializer {
public: // NOLINT
AsanInitializer() {
AsanInitFromRtl();
}
};
static AsanInitializer asan_initializer;
#endif // ASAN_DYNAMIC
} // namespace __asan
// ---------------------- Interface ---------------- {{{1
using namespace __asan; // NOLINT
void NOINLINE __asan_handle_no_return() {
if (asan_init_is_running)
return;
int local_stack;
AsanThread *curr_thread = GetCurrentThread();
uptr PageSize = GetPageSizeCached();
uptr top, bottom;
if (curr_thread) {
top = curr_thread->stack_top();
bottom = ((uptr)&local_stack - PageSize) & ~(PageSize - 1);
} else if (SANITIZER_RTEMS) {
// Give up on RTEMS.
return;
} else {
CHECK(!SANITIZER_FUCHSIA);
// If we haven't seen this thread, try asking the OS for stack bounds.
uptr tls_addr, tls_size, stack_size;
GetThreadStackAndTls(/*main=*/false, &bottom, &stack_size, &tls_addr,
&tls_size);
top = bottom + stack_size;
}
static const uptr kMaxExpectedCleanupSize = 64 << 20; // 64M
if (top - bottom > kMaxExpectedCleanupSize) {
static bool reported_warning = false;
if (reported_warning)
return;
reported_warning = true;
Report("WARNING: ASan is ignoring requested __asan_handle_no_return: "
"stack top: %p; bottom %p; size: %p (%zd)\n"
"False positive error reports may follow\n"
"For details see "
"https://github.com/google/sanitizers/issues/189\n",
top, bottom, top - bottom, top - bottom);
return;
}
PoisonShadow(bottom, top - bottom, 0);
if (curr_thread && curr_thread->has_fake_stack())
curr_thread->fake_stack()->HandleNoReturn();
}
extern "C" void *__asan_extra_spill_area() {
AsanThread *t = GetCurrentThread();
CHECK(t);
return t->extra_spill_area();
}
void __asan_handle_vfork(void *sp) {
AsanThread *t = GetCurrentThread();
CHECK(t);
uptr bottom = t->stack_bottom();
PoisonShadow(bottom, (uptr)sp - bottom, 0);
}
void NOINLINE __asan_set_death_callback(void (*callback)(void)) {
SetUserDieCallback(callback);
}
// Initialize as requested from instrumented application code.
// We use this call as a trigger to wake up ASan from deactivated state.
void __asan_init() {
AsanActivate();
AsanInitInternal();
}
void __asan_version_mismatch_check() {
// Do nothing.
}
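(A note on the shadow check performed by ASAN_MEMORY_ACCESS_CALLBACK_BODY above: each 8-byte granule of application memory maps to one shadow byte, where 0 means the whole granule is addressable and a small positive value k means only the first k bytes are. The snippet below is a minimal standalone sketch of that rule with the shadow byte passed in directly; kShadowGranularity and AccessIsPoisoned are illustrative names, not part of the runtime.)

#include <cstdint>
#include <cstdio>

constexpr uintptr_t kShadowGranularity = 8;  // bytes of app memory per shadow byte (SHADOW_SCALE == 3)

// Mirrors the macro's test: given shadow byte s for the granule containing addr,
// an access of `size` bytes (size <= kShadowGranularity) is bad when the offset
// of its last byte within the granule reaches or passes s.
bool AccessIsPoisoned(uintptr_t addr, unsigned size, int8_t s) {
  if (s == 0) return false;  // whole granule addressable
  return static_cast<int8_t>((addr & (kShadowGranularity - 1)) + size - 1) >= s;
}

int main() {
  // Granule with only its first 4 bytes addressable (shadow byte == 4):
  std::printf("%d\n", AccessIsPoisoned(0x1000, 4, 4));  // 0: bytes 0..3 are in bounds
  std::printf("%d\n", AccessIsPoisoned(0x1002, 4, 4));  // 1: the access reaches byte 5
  return 0;
}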
Index: projects/clang900-import/contrib/compiler-rt/lib/msan/msan.cc
===================================================================
--- projects/clang900-import/contrib/compiler-rt/lib/msan/msan.cc (revision 351721)
+++ projects/clang900-import/contrib/compiler-rt/lib/msan/msan.cc (revision 351722)
@@ -1,675 +1,675 @@
//===-- msan.cc -----------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of MemorySanitizer.
//
// MemorySanitizer runtime.
//===----------------------------------------------------------------------===//
#include "msan.h"
#include "msan_chained_origin_depot.h"
#include "msan_origin.h"
#include "msan_report.h"
#include "msan_thread.h"
#include "msan_poisoning.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_flags.h"
#include "sanitizer_common/sanitizer_flag_parser.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_procmaps.h"
#include "sanitizer_common/sanitizer_stacktrace.h"
#include "sanitizer_common/sanitizer_symbolizer.h"
#include "sanitizer_common/sanitizer_stackdepot.h"
#include "ubsan/ubsan_flags.h"
#include "ubsan/ubsan_init.h"
// ATTENTION! No system header includes in this file.
using namespace __sanitizer;
// Globals.
static THREADLOCAL int msan_expect_umr = 0;
static THREADLOCAL int msan_expected_umr_found = 0;
// Function argument shadow. Each argument starts at the next available 8-byte
// aligned address.
SANITIZER_INTERFACE_ATTRIBUTE
THREADLOCAL u64 __msan_param_tls[kMsanParamTlsSize / sizeof(u64)];
// Function argument origin. Each argument starts at the same offset as the
// corresponding shadow in (__msan_param_tls). Slightly weird, but changing this
// would break compatibility with older prebuilt binaries.
SANITIZER_INTERFACE_ATTRIBUTE
THREADLOCAL u32 __msan_param_origin_tls[kMsanParamTlsSize / sizeof(u32)];
SANITIZER_INTERFACE_ATTRIBUTE
THREADLOCAL u64 __msan_retval_tls[kMsanRetvalTlsSize / sizeof(u64)];
SANITIZER_INTERFACE_ATTRIBUTE
THREADLOCAL u32 __msan_retval_origin_tls;
SANITIZER_INTERFACE_ATTRIBUTE
ALIGNED(16) THREADLOCAL u64 __msan_va_arg_tls[kMsanParamTlsSize / sizeof(u64)];
SANITIZER_INTERFACE_ATTRIBUTE
ALIGNED(16)
THREADLOCAL u32 __msan_va_arg_origin_tls[kMsanParamTlsSize / sizeof(u32)];
SANITIZER_INTERFACE_ATTRIBUTE
THREADLOCAL u64 __msan_va_arg_overflow_size_tls;
SANITIZER_INTERFACE_ATTRIBUTE
THREADLOCAL u32 __msan_origin_tls;
static THREADLOCAL int is_in_symbolizer;
extern "C" SANITIZER_WEAK_ATTRIBUTE const int __msan_track_origins;
int __msan_get_track_origins() {
return &__msan_track_origins ? __msan_track_origins : 0;
}
extern "C" SANITIZER_WEAK_ATTRIBUTE const int __msan_keep_going;
namespace __msan {
void EnterSymbolizer() { ++is_in_symbolizer; }
void ExitSymbolizer() { --is_in_symbolizer; }
bool IsInSymbolizer() { return is_in_symbolizer; }
static Flags msan_flags;
Flags *flags() {
return &msan_flags;
}
int msan_inited = 0;
bool msan_init_is_running;
int msan_report_count = 0;
// Array of stack origins.
// FIXME: make it resizable.
static const uptr kNumStackOriginDescrs = 1024 * 1024;
static const char *StackOriginDescr[kNumStackOriginDescrs];
static uptr StackOriginPC[kNumStackOriginDescrs];
static atomic_uint32_t NumStackOriginDescrs;
void Flags::SetDefaults() {
#define MSAN_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
#include "msan_flags.inc"
#undef MSAN_FLAG
}
// keep_going is an old name for halt_on_error,
// and it has the inverse meaning.
class FlagHandlerKeepGoing : public FlagHandlerBase {
bool *halt_on_error_;
public:
explicit FlagHandlerKeepGoing(bool *halt_on_error)
: halt_on_error_(halt_on_error) {}
bool Parse(const char *value) final {
bool tmp;
FlagHandler<bool> h(&tmp);
if (!h.Parse(value)) return false;
*halt_on_error_ = !tmp;
return true;
}
};
static void RegisterMsanFlags(FlagParser *parser, Flags *f) {
#define MSAN_FLAG(Type, Name, DefaultValue, Description) \
RegisterFlag(parser, #Name, Description, &f->Name);
#include "msan_flags.inc"
#undef MSAN_FLAG
FlagHandlerKeepGoing *fh_keep_going = new (FlagParser::Alloc) // NOLINT
FlagHandlerKeepGoing(&f->halt_on_error);
parser->RegisterHandler("keep_going", fh_keep_going,
"deprecated, use halt_on_error");
}
static void InitializeFlags() {
SetCommonFlagsDefaults();
{
CommonFlags cf;
cf.CopyFrom(*common_flags());
cf.external_symbolizer_path = GetEnv("MSAN_SYMBOLIZER_PATH");
cf.malloc_context_size = 20;
cf.handle_ioctl = true;
// FIXME: test and enable.
cf.check_printf = false;
cf.intercept_tls_get_addr = true;
cf.exitcode = 77;
OverrideCommonFlags(cf);
}
Flags *f = flags();
f->SetDefaults();
FlagParser parser;
RegisterMsanFlags(&parser, f);
RegisterCommonFlags(&parser);
#if MSAN_CONTAINS_UBSAN
__ubsan::Flags *uf = __ubsan::flags();
uf->SetDefaults();
FlagParser ubsan_parser;
__ubsan::RegisterUbsanFlags(&ubsan_parser, uf);
RegisterCommonFlags(&ubsan_parser);
#endif
// Override from user-specified string.
if (__msan_default_options)
parser.ParseString(__msan_default_options());
#if MSAN_CONTAINS_UBSAN
const char *ubsan_default_options = __ubsan::MaybeCallUbsanDefaultOptions();
ubsan_parser.ParseString(ubsan_default_options);
#endif
parser.ParseStringFromEnv("MSAN_OPTIONS");
#if MSAN_CONTAINS_UBSAN
ubsan_parser.ParseStringFromEnv("UBSAN_OPTIONS");
#endif
InitializeCommonFlags();
if (Verbosity()) ReportUnrecognizedFlags();
if (common_flags()->help) parser.PrintFlagDescriptions();
// Check if deprecated exit_code MSan flag is set.
if (f->exit_code != -1) {
if (Verbosity())
Printf("MSAN_OPTIONS=exit_code is deprecated! "
"Please use MSAN_OPTIONS=exitcode instead.\n");
CommonFlags cf;
cf.CopyFrom(*common_flags());
cf.exitcode = f->exit_code;
OverrideCommonFlags(cf);
}
// Check flag values:
if (f->origin_history_size < 0 ||
f->origin_history_size > Origin::kMaxDepth) {
Printf(
"Origin history size invalid: %d. Must be 0 (unlimited) or in [1, %d] "
"range.\n",
f->origin_history_size, Origin::kMaxDepth);
Die();
}
// Limiting to kStackDepotMaxUseCount / 2 to avoid overflow in
// StackDepotHandle::inc_use_count_unsafe.
if (f->origin_history_per_stack_limit < 0 ||
f->origin_history_per_stack_limit > kStackDepotMaxUseCount / 2) {
Printf(
"Origin per-stack limit invalid: %d. Must be 0 (unlimited) or in [1, "
"%d] range.\n",
f->origin_history_per_stack_limit, kStackDepotMaxUseCount / 2);
Die();
}
if (f->store_context_size < 1) f->store_context_size = 1;
}
void PrintWarning(uptr pc, uptr bp) {
PrintWarningWithOrigin(pc, bp, __msan_origin_tls);
}
void PrintWarningWithOrigin(uptr pc, uptr bp, u32 origin) {
if (msan_expect_umr) {
// Printf("Expected UMR\n");
__msan_origin_tls = origin;
msan_expected_umr_found = 1;
return;
}
++msan_report_count;
GET_FATAL_STACK_TRACE_PC_BP(pc, bp);
u32 report_origin =
(__msan_get_track_origins() && Origin::isValidId(origin)) ? origin : 0;
ReportUMR(&stack, report_origin);
if (__msan_get_track_origins() && !Origin::isValidId(origin)) {
Printf(
" ORIGIN: invalid (%x). Might be a bug in MemorySanitizer origin "
"tracking.\n This could still be a bug in your code, too!\n",
origin);
}
}
void UnpoisonParam(uptr n) {
internal_memset(__msan_param_tls, 0, n * sizeof(*__msan_param_tls));
}
// Backup MSan runtime TLS state.
// Implementation must be async-signal-safe.
// Instances of this class may live on the signal handler stack, and data size
// may be an issue.
void ScopedThreadLocalStateBackup::Backup() {
va_arg_overflow_size_tls = __msan_va_arg_overflow_size_tls;
}
void ScopedThreadLocalStateBackup::Restore() {
// A lame implementation that only keeps essential state and resets the rest.
__msan_va_arg_overflow_size_tls = va_arg_overflow_size_tls;
internal_memset(__msan_param_tls, 0, sizeof(__msan_param_tls));
internal_memset(__msan_retval_tls, 0, sizeof(__msan_retval_tls));
internal_memset(__msan_va_arg_tls, 0, sizeof(__msan_va_arg_tls));
internal_memset(__msan_va_arg_origin_tls, 0,
sizeof(__msan_va_arg_origin_tls));
if (__msan_get_track_origins()) {
internal_memset(&__msan_retval_origin_tls, 0,
sizeof(__msan_retval_origin_tls));
internal_memset(__msan_param_origin_tls, 0,
sizeof(__msan_param_origin_tls));
}
}
void UnpoisonThreadLocalState() {
}
const char *GetStackOriginDescr(u32 id, uptr *pc) {
CHECK_LT(id, kNumStackOriginDescrs);
if (pc) *pc = StackOriginPC[id];
return StackOriginDescr[id];
}
u32 ChainOrigin(u32 id, StackTrace *stack) {
MsanThread *t = GetCurrentThread();
if (t && t->InSignalHandler())
return id;
Origin o = Origin::FromRawId(id);
stack->tag = StackTrace::TAG_UNKNOWN;
Origin chained = Origin::CreateChainedOrigin(o, stack);
return chained.raw_id();
}
} // namespace __msan
void __sanitizer::BufferedStackTrace::UnwindImpl(
uptr pc, uptr bp, void *context, bool request_fast, u32 max_depth) {
using namespace __msan;
MsanThread *t = GetCurrentThread();
if (!t || !StackTrace::WillUseFastUnwind(request_fast)) {
// Block reports from our interceptors during _Unwind_Backtrace.
SymbolizerScope sym_scope;
return Unwind(max_depth, pc, bp, context, 0, 0, false);
}
if (StackTrace::WillUseFastUnwind(request_fast))
Unwind(max_depth, pc, bp, nullptr, t->stack_top(), t->stack_bottom(), true);
else
Unwind(max_depth, pc, 0, context, 0, 0, false);
}
// Interface.
using namespace __msan;
#define MSAN_MAYBE_WARNING(type, size) \
void __msan_maybe_warning_##size(type s, u32 o) { \
GET_CALLER_PC_BP_SP; \
(void) sp; \
if (UNLIKELY(s)) { \
PrintWarningWithOrigin(pc, bp, o); \
if (__msan::flags()->halt_on_error) { \
Printf("Exiting\n"); \
Die(); \
} \
} \
}
MSAN_MAYBE_WARNING(u8, 1)
MSAN_MAYBE_WARNING(u16, 2)
MSAN_MAYBE_WARNING(u32, 4)
MSAN_MAYBE_WARNING(u64, 8)
#define MSAN_MAYBE_STORE_ORIGIN(type, size) \
void __msan_maybe_store_origin_##size(type s, void *p, u32 o) { \
if (UNLIKELY(s)) { \
if (__msan_get_track_origins() > 1) { \
GET_CALLER_PC_BP_SP; \
(void) sp; \
GET_STORE_STACK_TRACE_PC_BP(pc, bp); \
o = ChainOrigin(o, &stack); \
} \
*(u32 *)MEM_TO_ORIGIN((uptr)p & ~3UL) = o; \
} \
}
MSAN_MAYBE_STORE_ORIGIN(u8, 1)
MSAN_MAYBE_STORE_ORIGIN(u16, 2)
MSAN_MAYBE_STORE_ORIGIN(u32, 4)
MSAN_MAYBE_STORE_ORIGIN(u64, 8)
void __msan_warning() {
GET_CALLER_PC_BP_SP;
(void)sp;
PrintWarning(pc, bp);
if (__msan::flags()->halt_on_error) {
if (__msan::flags()->print_stats)
ReportStats();
Printf("Exiting\n");
Die();
}
}
void __msan_warning_noreturn() {
GET_CALLER_PC_BP_SP;
(void)sp;
PrintWarning(pc, bp);
if (__msan::flags()->print_stats)
ReportStats();
Printf("Exiting\n");
Die();
}
static void OnStackUnwind(const SignalContext &sig, const void *,
BufferedStackTrace *stack) {
stack->Unwind(sig.pc, sig.bp, sig.context,
common_flags()->fast_unwind_on_fatal);
}
static void MsanOnDeadlySignal(int signo, void *siginfo, void *context) {
HandleDeadlySignal(siginfo, context, GetTid(), &OnStackUnwind, nullptr);
}
static void MsanCheckFailed(const char *file, int line, const char *cond,
u64 v1, u64 v2) {
Report("MemorySanitizer CHECK failed: %s:%d \"%s\" (0x%zx, 0x%zx)\n", file,
line, cond, (uptr)v1, (uptr)v2);
PRINT_CURRENT_STACK_CHECK();
Die();
}
void __msan_init() {
CHECK(!msan_init_is_running);
if (msan_inited) return;
msan_init_is_running = 1;
SanitizerToolName = "MemorySanitizer";
AvoidCVE_2016_2143();
CacheBinaryName();
- CheckASLR();
InitializeFlags();
// Install tool-specific callbacks in sanitizer_common.
SetCheckFailedCallback(MsanCheckFailed);
__sanitizer_set_report_path(common_flags()->log_path);
InitializeInterceptors();
+ CheckASLR();
InitTlsSize();
InstallDeadlySignalHandlers(MsanOnDeadlySignal);
InstallAtExitHandler(); // Needs __cxa_atexit interceptor.
DisableCoreDumperIfNecessary();
if (StackSizeIsUnlimited()) {
VPrintf(1, "Unlimited stack, doing reexec\n");
// A reasonably large stack size. It is bigger than the usual 8Mb, because,
// well, the program could have been run with unlimited stack for a reason.
SetStackSizeLimitInBytes(32 * 1024 * 1024);
ReExec();
}
__msan_clear_on_return();
if (__msan_get_track_origins())
VPrintf(1, "msan_track_origins\n");
if (!InitShadow(__msan_get_track_origins())) {
Printf("FATAL: MemorySanitizer can not mmap the shadow memory.\n");
Printf("FATAL: Make sure to compile with -fPIE and to link with -pie.\n");
Printf("FATAL: Disabling ASLR is known to cause this error.\n");
Printf("FATAL: If running under GDB, try "
"'set disable-randomization off'.\n");
DumpProcessMap();
Die();
}
Symbolizer::GetOrInit()->AddHooks(EnterSymbolizer, ExitSymbolizer);
InitializeCoverage(common_flags()->coverage, common_flags()->coverage_dir);
MsanTSDInit(MsanTSDDtor);
MsanAllocatorInit();
MsanThread *main_thread = MsanThread::Create(nullptr, nullptr);
SetCurrentThread(main_thread);
main_thread->ThreadStart();
#if MSAN_CONTAINS_UBSAN
__ubsan::InitAsPlugin();
#endif
VPrintf(1, "MemorySanitizer init done\n");
msan_init_is_running = 0;
msan_inited = 1;
}
void __msan_set_keep_going(int keep_going) {
flags()->halt_on_error = !keep_going;
}
void __msan_set_expect_umr(int expect_umr) {
if (expect_umr) {
msan_expected_umr_found = 0;
} else if (!msan_expected_umr_found) {
GET_CALLER_PC_BP_SP;
(void)sp;
GET_FATAL_STACK_TRACE_PC_BP(pc, bp);
ReportExpectedUMRNotFound(&stack);
Die();
}
msan_expect_umr = expect_umr;
}
void __msan_print_shadow(const void *x, uptr size) {
if (!MEM_IS_APP(x)) {
Printf("Not a valid application address: %p\n", x);
return;
}
DescribeMemoryRange(x, size);
}
void __msan_dump_shadow(const void *x, uptr size) {
if (!MEM_IS_APP(x)) {
Printf("Not a valid application address: %p\n", x);
return;
}
unsigned char *s = (unsigned char*)MEM_TO_SHADOW(x);
for (uptr i = 0; i < size; i++)
Printf("%x%x ", s[i] >> 4, s[i] & 0xf);
Printf("\n");
}
sptr __msan_test_shadow(const void *x, uptr size) {
if (!MEM_IS_APP(x)) return -1;
unsigned char *s = (unsigned char *)MEM_TO_SHADOW((uptr)x);
for (uptr i = 0; i < size; ++i)
if (s[i])
return i;
return -1;
}
void __msan_check_mem_is_initialized(const void *x, uptr size) {
if (!__msan::flags()->report_umrs) return;
sptr offset = __msan_test_shadow(x, size);
if (offset < 0)
return;
GET_CALLER_PC_BP_SP;
(void)sp;
ReportUMRInsideAddressRange(__func__, x, size, offset);
__msan::PrintWarningWithOrigin(pc, bp,
__msan_get_origin(((const char *)x) + offset));
if (__msan::flags()->halt_on_error) {
Printf("Exiting\n");
Die();
}
}
int __msan_set_poison_in_malloc(int do_poison) {
int old = flags()->poison_in_malloc;
flags()->poison_in_malloc = do_poison;
return old;
}
int __msan_has_dynamic_component() { return false; }
NOINLINE
void __msan_clear_on_return() {
__msan_param_tls[0] = 0;
}
void __msan_partial_poison(const void* data, void* shadow, uptr size) {
internal_memcpy((void*)MEM_TO_SHADOW((uptr)data), shadow, size);
}
void __msan_load_unpoisoned(const void *src, uptr size, void *dst) {
internal_memcpy(dst, src, size);
__msan_unpoison(dst, size);
}
void __msan_set_origin(const void *a, uptr size, u32 origin) {
if (__msan_get_track_origins()) SetOrigin(a, size, origin);
}
// 'descr' is created at compile time and contains '----' at the beginning.
// When we see descr for the first time, we replace '----' with a unique id
// and set the origin to (id | (31st bit)).
void __msan_set_alloca_origin(void *a, uptr size, char *descr) {
__msan_set_alloca_origin4(a, size, descr, 0);
}
void __msan_set_alloca_origin4(void *a, uptr size, char *descr, uptr pc) {
static const u32 dash = '-';
static const u32 first_timer =
dash + (dash << 8) + (dash << 16) + (dash << 24);
u32 *id_ptr = (u32*)descr;
bool print = false; // internal_strstr(descr + 4, "AllocaTOTest") != 0;
u32 id = *id_ptr;
if (id == first_timer) {
u32 idx = atomic_fetch_add(&NumStackOriginDescrs, 1, memory_order_relaxed);
CHECK_LT(idx, kNumStackOriginDescrs);
StackOriginDescr[idx] = descr + 4;
#if SANITIZER_PPC64V1
// On PowerPC64 ELFv1, the address of a function actually points to a
// three-doubleword data structure with the first field containing
// the address of the function's code.
if (pc)
pc = *reinterpret_cast<uptr*>(pc);
#endif
StackOriginPC[idx] = pc;
id = Origin::CreateStackOrigin(idx).raw_id();
*id_ptr = id;
if (print)
Printf("First time: idx=%d id=%d %s %p \n", idx, id, descr + 4, pc);
}
if (print)
Printf("__msan_set_alloca_origin: descr=%s id=%x\n", descr + 4, id);
__msan_set_origin(a, size, id);
}
u32 __msan_chain_origin(u32 id) {
GET_CALLER_PC_BP_SP;
(void)sp;
GET_STORE_STACK_TRACE_PC_BP(pc, bp);
return ChainOrigin(id, &stack);
}
u32 __msan_get_origin(const void *a) {
if (!__msan_get_track_origins()) return 0;
uptr x = (uptr)a;
uptr aligned = x & ~3ULL;
uptr origin_ptr = MEM_TO_ORIGIN(aligned);
return *(u32*)origin_ptr;
}
int __msan_origin_is_descendant_or_same(u32 this_id, u32 prev_id) {
Origin o = Origin::FromRawId(this_id);
while (o.raw_id() != prev_id && o.isChainedOrigin())
o = o.getNextChainedOrigin(nullptr);
return o.raw_id() == prev_id;
}
u32 __msan_get_umr_origin() {
return __msan_origin_tls;
}
u16 __sanitizer_unaligned_load16(const uu16 *p) {
*(uu16 *)&__msan_retval_tls[0] = *(uu16 *)MEM_TO_SHADOW((uptr)p);
if (__msan_get_track_origins())
__msan_retval_origin_tls = GetOriginIfPoisoned((uptr)p, sizeof(*p));
return *p;
}
u32 __sanitizer_unaligned_load32(const uu32 *p) {
*(uu32 *)&__msan_retval_tls[0] = *(uu32 *)MEM_TO_SHADOW((uptr)p);
if (__msan_get_track_origins())
__msan_retval_origin_tls = GetOriginIfPoisoned((uptr)p, sizeof(*p));
return *p;
}
u64 __sanitizer_unaligned_load64(const uu64 *p) {
__msan_retval_tls[0] = *(uu64 *)MEM_TO_SHADOW((uptr)p);
if (__msan_get_track_origins())
__msan_retval_origin_tls = GetOriginIfPoisoned((uptr)p, sizeof(*p));
return *p;
}
void __sanitizer_unaligned_store16(uu16 *p, u16 x) {
u16 s = *(uu16 *)&__msan_param_tls[1];
*(uu16 *)MEM_TO_SHADOW((uptr)p) = s;
if (s && __msan_get_track_origins())
if (uu32 o = __msan_param_origin_tls[2])
SetOriginIfPoisoned((uptr)p, (uptr)&s, sizeof(s), o);
*p = x;
}
void __sanitizer_unaligned_store32(uu32 *p, u32 x) {
u32 s = *(uu32 *)&__msan_param_tls[1];
*(uu32 *)MEM_TO_SHADOW((uptr)p) = s;
if (s && __msan_get_track_origins())
if (uu32 o = __msan_param_origin_tls[2])
SetOriginIfPoisoned((uptr)p, (uptr)&s, sizeof(s), o);
*p = x;
}
void __sanitizer_unaligned_store64(uu64 *p, u64 x) {
u64 s = __msan_param_tls[1];
*(uu64 *)MEM_TO_SHADOW((uptr)p) = s;
if (s && __msan_get_track_origins())
if (uu32 o = __msan_param_origin_tls[2])
SetOriginIfPoisoned((uptr)p, (uptr)&s, sizeof(s), o);
*p = x;
}
void __msan_set_death_callback(void (*callback)(void)) {
SetUserDieCallback(callback);
}
#if !SANITIZER_SUPPORTS_WEAK_HOOKS
extern "C" {
SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE
const char* __msan_default_options() { return ""; }
} // extern "C"
#endif
extern "C" {
SANITIZER_INTERFACE_ATTRIBUTE
void __sanitizer_print_stack_trace() {
GET_FATAL_STACK_TRACE_PC_BP(StackTrace::GetCurrentPc(), GET_CURRENT_FRAME());
stack.Print();
}
} // extern "C"
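(For context on __msan_test_shadow and __msan_dump_shadow above: MSan keeps one shadow byte per application byte, and any nonzero shadow bit marks the corresponding byte as uninitialized. Below is a minimal sketch of the scan, assuming the shadow region is already available as a plain array; FirstPoisonedOffset is an illustrative name and the MEM_TO_SHADOW address translation is deliberately left out.)

#include <cstddef>
#include <cstdio>

// Returns the offset of the first byte whose shadow is nonzero (i.e. the first
// uninitialized byte), or -1 if the whole range is initialized - the same
// contract as __msan_test_shadow, minus the address translation.
ptrdiff_t FirstPoisonedOffset(const unsigned char *shadow, size_t size) {
  for (size_t i = 0; i < size; ++i)
    if (shadow[i]) return static_cast<ptrdiff_t>(i);
  return -1;
}

int main() {
  unsigned char shadow[8] = {0, 0, 0, 0xff, 0, 0, 0, 0};  // byte 3 is uninitialized
  std::printf("first poisoned offset: %td\n", FirstPoisonedOffset(shadow, sizeof shadow));
  return 0;
}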
Index: projects/clang900-import/contrib/compiler-rt/lib/sanitizer_common/sanitizer_linux.cc
===================================================================
--- projects/clang900-import/contrib/compiler-rt/lib/sanitizer_common/sanitizer_linux.cc (revision 351721)
+++ projects/clang900-import/contrib/compiler-rt/lib/sanitizer_common/sanitizer_linux.cc (revision 351722)
@@ -1,2106 +1,2135 @@
//===-- sanitizer_linux.cc ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is shared between AddressSanitizer and ThreadSanitizer
// run-time libraries and implements linux-specific functions from
// sanitizer_libc.h.
//===----------------------------------------------------------------------===//
#include "sanitizer_platform.h"
#if SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_NETBSD || \
SANITIZER_OPENBSD || SANITIZER_SOLARIS
#include "sanitizer_common.h"
#include "sanitizer_flags.h"
#include "sanitizer_getauxval.h"
#include "sanitizer_internal_defs.h"
#include "sanitizer_libc.h"
#include "sanitizer_linux.h"
#include "sanitizer_mutex.h"
#include "sanitizer_placement_new.h"
#include "sanitizer_procmaps.h"
#if SANITIZER_LINUX
#include <asm/param.h>
#endif
// For mips64, syscall(__NR_stat) fills the buffer in the 'struct kernel_stat'
// format. Struct kernel_stat is defined as 'struct stat' in asm/stat.h. To
// access stat from asm/stat.h without conflicting with the definition in
// sys/stat.h, we use this trick.
#if defined(__mips64)
#include <asm/unistd.h>
#include <sys/types.h>
#define stat kernel_stat
#include <asm/stat.h>
#undef stat
#endif
#include <dlfcn.h>
#include <errno.h>
#include <fcntl.h>
#include <link.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/param.h>
#if !SANITIZER_SOLARIS
#include <sys/ptrace.h>
#endif
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#if !SANITIZER_OPENBSD
#include <ucontext.h>
#endif
#if SANITIZER_OPENBSD
#include <sys/futex.h>
#include <sys/sysctl.h>
#endif
#include <unistd.h>
#if SANITIZER_LINUX
#include <sys/utsname.h>
#endif
#if SANITIZER_LINUX && !SANITIZER_ANDROID
#include <sys/personality.h>
#endif
#if SANITIZER_FREEBSD
#include <sys/exec.h>
#include <sys/sysctl.h>
#include <machine/atomic.h>
extern "C" {
// <sys/umtx.h> must be included after <errno.h> and <sys/types.h> on
// FreeBSD 9.2 and 10.0.
#include <sys/umtx.h>
}
#include <sys/thr.h>
#endif // SANITIZER_FREEBSD
#if SANITIZER_NETBSD
#include <limits.h> // For NAME_MAX
#include <sys/sysctl.h>
#include <sys/exec.h>
extern struct ps_strings *__ps_strings;
#endif // SANITIZER_NETBSD
#if SANITIZER_SOLARIS
#include <stdlib.h>
#include <thread.h>
#define environ _environ
#endif
extern char **environ;
#if SANITIZER_LINUX
// <linux/time.h>
struct kernel_timeval {
long tv_sec;
long tv_usec;
};
// <linux/futex.h> is broken on some linux distributions.
const int FUTEX_WAIT = 0;
const int FUTEX_WAKE = 1;
const int FUTEX_PRIVATE_FLAG = 128;
const int FUTEX_WAIT_PRIVATE = FUTEX_WAIT | FUTEX_PRIVATE_FLAG;
const int FUTEX_WAKE_PRIVATE = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
#endif // SANITIZER_LINUX
// Are we using 32-bit or 64-bit Linux syscalls?
// x32 (which defines __x86_64__) has SANITIZER_WORDSIZE == 32
// but it still needs to use 64-bit syscalls.
#if SANITIZER_LINUX && (defined(__x86_64__) || defined(__powerpc64__) || \
SANITIZER_WORDSIZE == 64)
# define SANITIZER_LINUX_USES_64BIT_SYSCALLS 1
#else
# define SANITIZER_LINUX_USES_64BIT_SYSCALLS 0
#endif
// Note: FreeBSD has implemented both the
// Linux and OpenBSD APIs, most likely available
// starting with the 12.x releases.
#if SANITIZER_LINUX && defined(__NR_getrandom)
# if !defined(GRND_NONBLOCK)
# define GRND_NONBLOCK 1
# endif
# define SANITIZER_USE_GETRANDOM 1
#else
# define SANITIZER_USE_GETRANDOM 0
#endif // SANITIZER_LINUX && defined(__NR_getrandom)
#if SANITIZER_OPENBSD
# define SANITIZER_USE_GETENTROPY 1
#else
# if SANITIZER_FREEBSD && __FreeBSD_version >= 1200000
# define SANITIZER_USE_GETENTROPY 1
# else
# define SANITIZER_USE_GETENTROPY 0
# endif
#endif // SANITIZER_USE_GETENTROPY
namespace __sanitizer {
#if SANITIZER_LINUX && defined(__x86_64__)
#include "sanitizer_syscall_linux_x86_64.inc"
#elif SANITIZER_LINUX && defined(__aarch64__)
#include "sanitizer_syscall_linux_aarch64.inc"
#elif SANITIZER_LINUX && defined(__arm__)
#include "sanitizer_syscall_linux_arm.inc"
#else
#include "sanitizer_syscall_generic.inc"
#endif
// --------------- sanitizer_libc.h
#if !SANITIZER_SOLARIS && !SANITIZER_NETBSD
#if !SANITIZER_S390 && !SANITIZER_OPENBSD
uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd,
OFF_T offset) {
#if SANITIZER_FREEBSD || SANITIZER_LINUX_USES_64BIT_SYSCALLS
return internal_syscall(SYSCALL(mmap), (uptr)addr, length, prot, flags, fd,
offset);
#else
// mmap2 specifies file offset in 4096-byte units.
CHECK(IsAligned(offset, 4096));
return internal_syscall(SYSCALL(mmap2), addr, length, prot, flags, fd,
offset / 4096);
#endif
}
#endif // !SANITIZER_S390 && !SANITIZER_OPENBSD
#if !SANITIZER_OPENBSD
uptr internal_munmap(void *addr, uptr length) {
return internal_syscall(SYSCALL(munmap), (uptr)addr, length);
}
int internal_mprotect(void *addr, uptr length, int prot) {
return internal_syscall(SYSCALL(mprotect), (uptr)addr, length, prot);
}
#endif
uptr internal_close(fd_t fd) {
return internal_syscall(SYSCALL(close), fd);
}
uptr internal_open(const char *filename, int flags) {
#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
return internal_syscall(SYSCALL(openat), AT_FDCWD, (uptr)filename, flags);
#else
return internal_syscall(SYSCALL(open), (uptr)filename, flags);
#endif
}
uptr internal_open(const char *filename, int flags, u32 mode) {
#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
return internal_syscall(SYSCALL(openat), AT_FDCWD, (uptr)filename, flags,
mode);
#else
return internal_syscall(SYSCALL(open), (uptr)filename, flags, mode);
#endif
}
uptr internal_read(fd_t fd, void *buf, uptr count) {
sptr res;
HANDLE_EINTR(res,
(sptr)internal_syscall(SYSCALL(read), fd, (uptr)buf, count));
return res;
}
uptr internal_write(fd_t fd, const void *buf, uptr count) {
sptr res;
HANDLE_EINTR(res,
(sptr)internal_syscall(SYSCALL(write), fd, (uptr)buf, count));
return res;
}
uptr internal_ftruncate(fd_t fd, uptr size) {
sptr res;
HANDLE_EINTR(res, (sptr)internal_syscall(SYSCALL(ftruncate), fd,
(OFF_T)size));
return res;
}
#if !SANITIZER_LINUX_USES_64BIT_SYSCALLS && SANITIZER_LINUX
static void stat64_to_stat(struct stat64 *in, struct stat *out) {
internal_memset(out, 0, sizeof(*out));
out->st_dev = in->st_dev;
out->st_ino = in->st_ino;
out->st_mode = in->st_mode;
out->st_nlink = in->st_nlink;
out->st_uid = in->st_uid;
out->st_gid = in->st_gid;
out->st_rdev = in->st_rdev;
out->st_size = in->st_size;
out->st_blksize = in->st_blksize;
out->st_blocks = in->st_blocks;
out->st_atime = in->st_atime;
out->st_mtime = in->st_mtime;
out->st_ctime = in->st_ctime;
}
#endif
#if defined(__mips64)
// Undefine compatibility macros from <sys/stat.h>
// so that they would not clash with the kernel_stat
// st_[a|m|c]time fields
#undef st_atime
#undef st_mtime
#undef st_ctime
#if defined(SANITIZER_ANDROID)
// Bionic sys/stat.h defines additional macros
// for compatibility with the old NDKs and
// they clash with the kernel_stat structure
// st_[a|m|c]time_nsec fields.
#undef st_atime_nsec
#undef st_mtime_nsec
#undef st_ctime_nsec
#endif
static void kernel_stat_to_stat(struct kernel_stat *in, struct stat *out) {
internal_memset(out, 0, sizeof(*out));
out->st_dev = in->st_dev;
out->st_ino = in->st_ino;
out->st_mode = in->st_mode;
out->st_nlink = in->st_nlink;
out->st_uid = in->st_uid;
out->st_gid = in->st_gid;
out->st_rdev = in->st_rdev;
out->st_size = in->st_size;
out->st_blksize = in->st_blksize;
out->st_blocks = in->st_blocks;
#if defined(__USE_MISC) || \
defined(__USE_XOPEN2K8) || \
defined(SANITIZER_ANDROID)
out->st_atim.tv_sec = in->st_atime;
out->st_atim.tv_nsec = in->st_atime_nsec;
out->st_mtim.tv_sec = in->st_mtime;
out->st_mtim.tv_nsec = in->st_mtime_nsec;
out->st_ctim.tv_sec = in->st_ctime;
out->st_ctim.tv_nsec = in->st_ctime_nsec;
#else
out->st_atime = in->st_atime;
out->st_atimensec = in->st_atime_nsec;
out->st_mtime = in->st_mtime;
out->st_mtimensec = in->st_mtime_nsec;
out->st_ctime = in->st_ctime;
out->st_ctimensec = in->st_ctime_nsec;
#endif
}
#endif
uptr internal_stat(const char *path, void *buf) {
#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
return internal_syscall(SYSCALL(fstatat), AT_FDCWD, (uptr)path, (uptr)buf, 0);
#elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf,
0);
#elif SANITIZER_LINUX_USES_64BIT_SYSCALLS
# if defined(__mips64)
// For mips64, stat syscall fills buffer in the format of kernel_stat
struct kernel_stat kbuf;
int res = internal_syscall(SYSCALL(stat), path, &kbuf);
kernel_stat_to_stat(&kbuf, (struct stat *)buf);
return res;
# else
return internal_syscall(SYSCALL(stat), (uptr)path, (uptr)buf);
# endif
#else
struct stat64 buf64;
int res = internal_syscall(SYSCALL(stat64), path, &buf64);
stat64_to_stat(&buf64, (struct stat *)buf);
return res;
#endif
}
uptr internal_lstat(const char *path, void *buf) {
#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
return internal_syscall(SYSCALL(fstatat), AT_FDCWD, (uptr)path, (uptr)buf,
AT_SYMLINK_NOFOLLOW);
#elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf,
AT_SYMLINK_NOFOLLOW);
#elif SANITIZER_LINUX_USES_64BIT_SYSCALLS
# if SANITIZER_MIPS64
// For mips64, lstat syscall fills buffer in the format of kernel_stat
struct kernel_stat kbuf;
int res = internal_syscall(SYSCALL(lstat), path, &kbuf);
kernel_stat_to_stat(&kbuf, (struct stat *)buf);
return res;
# else
return internal_syscall(SYSCALL(lstat), (uptr)path, (uptr)buf);
# endif
#else
struct stat64 buf64;
int res = internal_syscall(SYSCALL(lstat64), path, &buf64);
stat64_to_stat(&buf64, (struct stat *)buf);
return res;
#endif
}
uptr internal_fstat(fd_t fd, void *buf) {
#if SANITIZER_FREEBSD || SANITIZER_OPENBSD || \
SANITIZER_LINUX_USES_64BIT_SYSCALLS
#if SANITIZER_MIPS64 && !SANITIZER_OPENBSD
// For mips64, fstat syscall fills buffer in the format of kernel_stat
struct kernel_stat kbuf;
int res = internal_syscall(SYSCALL(fstat), fd, &kbuf);
kernel_stat_to_stat(&kbuf, (struct stat *)buf);
return res;
# else
return internal_syscall(SYSCALL(fstat), fd, (uptr)buf);
# endif
#else
struct stat64 buf64;
int res = internal_syscall(SYSCALL(fstat64), fd, &buf64);
stat64_to_stat(&buf64, (struct stat *)buf);
return res;
#endif
}
uptr internal_filesize(fd_t fd) {
struct stat st;
if (internal_fstat(fd, &st))
return -1;
return (uptr)st.st_size;
}
uptr internal_dup(int oldfd) {
return internal_syscall(SYSCALL(dup), oldfd);
}
uptr internal_dup2(int oldfd, int newfd) {
#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
return internal_syscall(SYSCALL(dup3), oldfd, newfd, 0);
#else
return internal_syscall(SYSCALL(dup2), oldfd, newfd);
#endif
}
uptr internal_readlink(const char *path, char *buf, uptr bufsize) {
#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
return internal_syscall(SYSCALL(readlinkat), AT_FDCWD, (uptr)path, (uptr)buf,
bufsize);
#elif SANITIZER_OPENBSD
return internal_syscall(SYSCALL(readlinkat), AT_FDCWD, (uptr)path, (uptr)buf,
bufsize);
#else
return internal_syscall(SYSCALL(readlink), (uptr)path, (uptr)buf, bufsize);
#endif
}
uptr internal_unlink(const char *path) {
#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS || SANITIZER_OPENBSD
return internal_syscall(SYSCALL(unlinkat), AT_FDCWD, (uptr)path, 0);
#else
return internal_syscall(SYSCALL(unlink), (uptr)path);
#endif
}
uptr internal_rename(const char *oldpath, const char *newpath) {
#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS || SANITIZER_OPENBSD
return internal_syscall(SYSCALL(renameat), AT_FDCWD, (uptr)oldpath, AT_FDCWD,
(uptr)newpath);
#else
return internal_syscall(SYSCALL(rename), (uptr)oldpath, (uptr)newpath);
#endif
}
uptr internal_sched_yield() {
return internal_syscall(SYSCALL(sched_yield));
}
void internal__exit(int exitcode) {
#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
internal_syscall(SYSCALL(exit), exitcode);
#else
internal_syscall(SYSCALL(exit_group), exitcode);
#endif
Die(); // Unreachable.
}
unsigned int internal_sleep(unsigned int seconds) {
struct timespec ts;
ts.tv_sec = seconds;
ts.tv_nsec = 0;
int res = internal_syscall(SYSCALL(nanosleep), &ts, &ts);
if (res) return ts.tv_sec;
return 0;
}
uptr internal_execve(const char *filename, char *const argv[],
char *const envp[]) {
return internal_syscall(SYSCALL(execve), (uptr)filename, (uptr)argv,
(uptr)envp);
}
#endif // !SANITIZER_SOLARIS && !SANITIZER_NETBSD
// ----------------- sanitizer_common.h
bool FileExists(const char *filename) {
if (ShouldMockFailureToOpen(filename))
return false;
struct stat st;
#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
if (internal_syscall(SYSCALL(newfstatat), AT_FDCWD, filename, &st, 0))
#else
if (internal_stat(filename, &st))
#endif
return false;
// Sanity check: filename is a regular file.
return S_ISREG(st.st_mode);
}
#if !SANITIZER_NETBSD
tid_t GetTid() {
#if SANITIZER_FREEBSD
long Tid;
thr_self(&Tid);
return Tid;
#elif SANITIZER_OPENBSD
return internal_syscall(SYSCALL(getthrid));
#elif SANITIZER_SOLARIS
return thr_self();
#else
return internal_syscall(SYSCALL(gettid));
#endif
}
int TgKill(pid_t pid, tid_t tid, int sig) {
#if SANITIZER_LINUX
return internal_syscall(SYSCALL(tgkill), pid, tid, sig);
#elif SANITIZER_FREEBSD
return internal_syscall(SYSCALL(thr_kill2), pid, tid, sig);
#elif SANITIZER_OPENBSD
(void)pid;
return internal_syscall(SYSCALL(thrkill), tid, sig, nullptr);
#elif SANITIZER_SOLARIS
(void)pid;
return thr_kill(tid, sig);
#endif
}
#endif
#if !SANITIZER_SOLARIS && !SANITIZER_NETBSD
u64 NanoTime() {
#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
timeval tv;
#else
kernel_timeval tv;
#endif
internal_memset(&tv, 0, sizeof(tv));
internal_syscall(SYSCALL(gettimeofday), &tv, 0);
return (u64)tv.tv_sec * 1000*1000*1000 + tv.tv_usec * 1000;
}
uptr internal_clock_gettime(__sanitizer_clockid_t clk_id, void *tp) {
return internal_syscall(SYSCALL(clock_gettime), clk_id, tp);
}
#endif // !SANITIZER_SOLARIS && !SANITIZER_NETBSD
// Like getenv, but reads env directly from /proc (on Linux) or parses the
// 'environ' array (on some others) and does not use libc. This function
// should be called first inside __asan_init.
const char *GetEnv(const char *name) {
#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || \
SANITIZER_SOLARIS
if (::environ != 0) {
uptr NameLen = internal_strlen(name);
for (char **Env = ::environ; *Env != 0; Env++) {
if (internal_strncmp(*Env, name, NameLen) == 0 && (*Env)[NameLen] == '=')
return (*Env) + NameLen + 1;
}
}
return 0; // Not found.
#elif SANITIZER_LINUX
static char *environ;
static uptr len;
static bool inited;
if (!inited) {
inited = true;
uptr environ_size;
if (!ReadFileToBuffer("/proc/self/environ", &environ, &environ_size, &len))
environ = nullptr;
}
if (!environ || len == 0) return nullptr;
uptr namelen = internal_strlen(name);
const char *p = environ;
while (*p != '\0') { // will happen at the \0\0 that terminates the buffer
// proc file has the format NAME=value\0NAME=value\0NAME=value\0...
const char* endp =
(char*)internal_memchr(p, '\0', len - (p - environ));
if (!endp) // this entry isn't NUL terminated
return nullptr;
else if (!internal_memcmp(p, name, namelen) && p[namelen] == '=') // Match.
return p + namelen + 1; // point after =
p = endp + 1;
}
return nullptr; // Not found.
#else
#error "Unsupported platform"
#endif
}
#if !SANITIZER_FREEBSD && !SANITIZER_NETBSD && !SANITIZER_OPENBSD
extern "C" {
SANITIZER_WEAK_ATTRIBUTE extern void *__libc_stack_end;
}
#endif
#if !SANITIZER_GO && !SANITIZER_FREEBSD && !SANITIZER_NETBSD && \
!SANITIZER_OPENBSD
static void ReadNullSepFileToArray(const char *path, char ***arr,
int arr_size) {
char *buff;
uptr buff_size;
uptr buff_len;
*arr = (char **)MmapOrDie(arr_size * sizeof(char *), "NullSepFileArray");
if (!ReadFileToBuffer(path, &buff, &buff_size, &buff_len, 1024 * 1024)) {
(*arr)[0] = nullptr;
return;
}
(*arr)[0] = buff;
int count, i;
for (count = 1, i = 1; ; i++) {
if (buff[i] == 0) {
if (buff[i+1] == 0) break;
(*arr)[count] = &buff[i+1];
CHECK_LE(count, arr_size - 1); // FIXME: make this more flexible.
count++;
}
}
(*arr)[count] = nullptr;
}
#endif
#if !SANITIZER_OPENBSD
static void GetArgsAndEnv(char ***argv, char ***envp) {
#if SANITIZER_FREEBSD
// On FreeBSD, retrieving the argument and environment arrays is done via the
// kern.ps_strings sysctl, which returns a pointer to a structure containing
// this information. See also <sys/exec.h>.
ps_strings *pss;
uptr sz = sizeof(pss);
if (internal_sysctlbyname("kern.ps_strings", &pss, &sz, NULL, 0) == -1) {
Printf("sysctl kern.ps_strings failed\n");
Die();
}
*argv = pss->ps_argvstr;
*envp = pss->ps_envstr;
#elif SANITIZER_NETBSD
*argv = __ps_strings->ps_argvstr;
*envp = __ps_strings->ps_envstr;
#else // SANITIZER_FREEBSD
#if !SANITIZER_GO
if (&__libc_stack_end) {
#endif // !SANITIZER_GO
uptr* stack_end = (uptr*)__libc_stack_end;
int argc = *stack_end;
*argv = (char**)(stack_end + 1);
*envp = (char**)(stack_end + argc + 2);
#if !SANITIZER_GO
} else {
static const int kMaxArgv = 2000, kMaxEnvp = 2000;
ReadNullSepFileToArray("/proc/self/cmdline", argv, kMaxArgv);
ReadNullSepFileToArray("/proc/self/environ", envp, kMaxEnvp);
}
#endif // !SANITIZER_GO
#endif // SANITIZER_FREEBSD
}
char **GetArgv() {
char **argv, **envp;
GetArgsAndEnv(&argv, &envp);
return argv;
}
char **GetEnviron() {
char **argv, **envp;
GetArgsAndEnv(&argv, &envp);
return envp;
}
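// Usage sketch (illustrative only): GetArgv()/GetEnviron() hand back the raw
// argument and environment vectors without going through libc.
// PrintFirstArgSketch is a hypothetical helper for this example.
static inline void PrintFirstArgSketch() {
  char **argv = GetArgv();
  if (argv && argv[0])
    Printf("executable: %s\n", argv[0]);
}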
#endif // !SANITIZER_OPENBSD
#if !SANITIZER_SOLARIS
enum MutexState {
MtxUnlocked = 0,
MtxLocked = 1,
MtxSleeping = 2
};
BlockingMutex::BlockingMutex() {
internal_memset(this, 0, sizeof(*this));
}
void BlockingMutex::Lock() {
CHECK_EQ(owner_, 0);
atomic_uint32_t *m = reinterpret_cast<atomic_uint32_t *>(&opaque_storage_);
if (atomic_exchange(m, MtxLocked, memory_order_acquire) == MtxUnlocked)
return;
while (atomic_exchange(m, MtxSleeping, memory_order_acquire) != MtxUnlocked) {
#if SANITIZER_FREEBSD
_umtx_op(m, UMTX_OP_WAIT_UINT, MtxSleeping, 0, 0);
#elif SANITIZER_NETBSD
sched_yield(); /* No userspace futex-like synchronization */
#else
internal_syscall(SYSCALL(futex), (uptr)m, FUTEX_WAIT_PRIVATE, MtxSleeping,
0, 0, 0);
#endif
}
}
void BlockingMutex::Unlock() {
atomic_uint32_t *m = reinterpret_cast<atomic_uint32_t *>(&opaque_storage_);
u32 v = atomic_exchange(m, MtxUnlocked, memory_order_release);
CHECK_NE(v, MtxUnlocked);
if (v == MtxSleeping) {
#if SANITIZER_FREEBSD
_umtx_op(m, UMTX_OP_WAKE, 1, 0, 0);
#elif SANITIZER_NETBSD
/* No userspace futex-like synchronization */
#else
internal_syscall(SYSCALL(futex), (uptr)m, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
#endif
}
}
void BlockingMutex::CheckLocked() {
atomic_uint32_t *m = reinterpret_cast<atomic_uint32_t *>(&opaque_storage_);
CHECK_NE(MtxUnlocked, atomic_load(m, memory_order_relaxed));
}
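// Usage sketch (illustrative only): BlockingMutex is a futex-backed
// (or _umtx_op/sched_yield-backed) lock; a contended locker publishes
// MtxSleeping before blocking so Unlock() knows a wake-up is needed.
// GuardedIncrementSketch is a hypothetical helper for this example.
static inline void GuardedIncrementSketch(BlockingMutex *mu, int *counter) {
  mu->Lock();      // Tries an atomic_exchange first, futex-waits if contended.
  ++*counter;
  mu->Unlock();    // Wakes a sleeper only if the MtxSleeping state was seen.
}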
#endif // !SANITIZER_SOLARIS
// ----------------- sanitizer_linux.h
// The actual size of this structure is specified by d_reclen.
// Note that getdents64 uses a different structure format. We only provide the
// 32-bit syscall here.
#if SANITIZER_NETBSD
// Not used
#elif SANITIZER_OPENBSD
// struct dirent is different between Linux and us (OpenBSD). At this moment,
// we use only d_fileno (which Linux calls d_ino), d_reclen, and d_name.
struct linux_dirent {
u64 d_ino; // d_fileno
u16 d_reclen;
u16 d_namlen; // not used
u8 d_type; // not used
char d_name[NAME_MAX + 1];
};
#else
struct linux_dirent {
#if SANITIZER_X32 || defined(__aarch64__)
u64 d_ino;
u64 d_off;
#else
unsigned long d_ino;
unsigned long d_off;
#endif
unsigned short d_reclen;
#ifdef __aarch64__
unsigned char d_type;
#endif
char d_name[256];
};
#endif
#if !SANITIZER_SOLARIS && !SANITIZER_NETBSD
// Syscall wrappers.
uptr internal_ptrace(int request, int pid, void *addr, void *data) {
return internal_syscall(SYSCALL(ptrace), request, pid, (uptr)addr,
(uptr)data);
}
uptr internal_waitpid(int pid, int *status, int options) {
return internal_syscall(SYSCALL(wait4), pid, (uptr)status, options,
0 /* rusage */);
}
uptr internal_getpid() {
return internal_syscall(SYSCALL(getpid));
}
uptr internal_getppid() {
return internal_syscall(SYSCALL(getppid));
}
uptr internal_getdents(fd_t fd, struct linux_dirent *dirp, unsigned int count) {
#if SANITIZER_FREEBSD
return internal_syscall(SYSCALL(getdirentries), fd, (uptr)dirp, count, NULL);
#elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
return internal_syscall(SYSCALL(getdents64), fd, (uptr)dirp, count);
#else
return internal_syscall(SYSCALL(getdents), fd, (uptr)dirp, count);
#endif
}
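// Usage sketch (illustrative only): records returned by internal_getdents()
// are variable-length, so the walk advances by d_reclen, exactly as the
// comment above the linux_dirent definitions describes. ListDirSketch and
// kBufSize are hypothetical names for this example.
static inline void ListDirSketch(fd_t dir_fd) {
  const unsigned kBufSize = 4096;
  char buf[kBufSize];
  uptr nread = internal_getdents(dir_fd, (struct linux_dirent *)buf, kBufSize);
  if (internal_iserror(nread)) return;
  for (uptr off = 0; off < nread;) {
    struct linux_dirent *e = (struct linux_dirent *)(buf + off);
    Printf("%s\n", e->d_name);
    off += e->d_reclen;  // The actual record size.
  }
}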
uptr internal_lseek(fd_t fd, OFF_T offset, int whence) {
return internal_syscall(SYSCALL(lseek), fd, offset, whence);
}
#if SANITIZER_LINUX
uptr internal_prctl(int option, uptr arg2, uptr arg3, uptr arg4, uptr arg5) {
return internal_syscall(SYSCALL(prctl), option, arg2, arg3, arg4, arg5);
}
#endif
uptr internal_sigaltstack(const void *ss, void *oss) {
return internal_syscall(SYSCALL(sigaltstack), (uptr)ss, (uptr)oss);
}
int internal_fork() {
#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
return internal_syscall(SYSCALL(clone), SIGCHLD, 0);
#else
return internal_syscall(SYSCALL(fork));
#endif
}
#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
int internal_sysctl(const int *name, unsigned int namelen, void *oldp,
uptr *oldlenp, const void *newp, uptr newlen) {
#if SANITIZER_OPENBSD
return sysctl(name, namelen, oldp, (size_t *)oldlenp, (void *)newp,
(size_t)newlen);
#else
return internal_syscall(SYSCALL(__sysctl), name, namelen, oldp,
(size_t *)oldlenp, newp, (size_t)newlen);
#endif
}
#if SANITIZER_FREEBSD
int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp,
const void *newp, uptr newlen) {
static decltype(sysctlbyname) *real = nullptr;
if (!real)
real = (decltype(sysctlbyname) *)dlsym(RTLD_NEXT, "sysctlbyname");
CHECK(real);
return real(sname, oldp, (size_t *)oldlenp, newp, (size_t)newlen);
}
#endif
#endif
#if SANITIZER_LINUX
#define SA_RESTORER 0x04000000
// Doesn't set sa_restorer if the caller did not set it, so use with caution
// (see below).
int internal_sigaction_norestorer(int signum, const void *act, void *oldact) {
__sanitizer_kernel_sigaction_t k_act, k_oldact;
internal_memset(&k_act, 0, sizeof(__sanitizer_kernel_sigaction_t));
internal_memset(&k_oldact, 0, sizeof(__sanitizer_kernel_sigaction_t));
const __sanitizer_sigaction *u_act = (const __sanitizer_sigaction *)act;
__sanitizer_sigaction *u_oldact = (__sanitizer_sigaction *)oldact;
if (u_act) {
k_act.handler = u_act->handler;
k_act.sigaction = u_act->sigaction;
internal_memcpy(&k_act.sa_mask, &u_act->sa_mask,
sizeof(__sanitizer_kernel_sigset_t));
// Without SA_RESTORER the kernel ignores the calls (probably returns EINVAL).
k_act.sa_flags = u_act->sa_flags | SA_RESTORER;
// FIXME: most often sa_restorer is unset, however the kernel requires it
// to point to a valid signal restorer that calls the rt_sigreturn syscall.
// If sa_restorer passed to the kernel is NULL, the program may crash upon
// signal delivery or fail to unwind the stack in the signal handler.
// The libc implementation of sigaction() passes its own restorer to
// rt_sigaction, so we need to do the same (we'll need to reimplement the
// restorers; for x86_64 the restorer address can be obtained from
// oldact->sa_restorer upon a call to sigaction(xxx, NULL, oldact)).
#if !SANITIZER_ANDROID || !SANITIZER_MIPS32
k_act.sa_restorer = u_act->sa_restorer;
#endif
}
uptr result = internal_syscall(SYSCALL(rt_sigaction), (uptr)signum,
(uptr)(u_act ? &k_act : nullptr),
(uptr)(u_oldact ? &k_oldact : nullptr),
(uptr)sizeof(__sanitizer_kernel_sigset_t));
if ((result == 0) && u_oldact) {
u_oldact->handler = k_oldact.handler;
u_oldact->sigaction = k_oldact.sigaction;
internal_memcpy(&u_oldact->sa_mask, &k_oldact.sa_mask,
sizeof(__sanitizer_kernel_sigset_t));
u_oldact->sa_flags = k_oldact.sa_flags;
#if !SANITIZER_ANDROID || !SANITIZER_MIPS32
u_oldact->sa_restorer = k_oldact.sa_restorer;
#endif
}
return result;
}
#endif // SANITIZER_LINUX
uptr internal_sigprocmask(int how, __sanitizer_sigset_t *set,
__sanitizer_sigset_t *oldset) {
#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
return internal_syscall(SYSCALL(sigprocmask), how, set, oldset);
#else
__sanitizer_kernel_sigset_t *k_set = (__sanitizer_kernel_sigset_t *)set;
__sanitizer_kernel_sigset_t *k_oldset = (__sanitizer_kernel_sigset_t *)oldset;
return internal_syscall(SYSCALL(rt_sigprocmask), (uptr)how,
(uptr)&k_set->sig[0], (uptr)&k_oldset->sig[0],
sizeof(__sanitizer_kernel_sigset_t));
#endif
}
void internal_sigfillset(__sanitizer_sigset_t *set) {
internal_memset(set, 0xff, sizeof(*set));
}
void internal_sigemptyset(__sanitizer_sigset_t *set) {
internal_memset(set, 0, sizeof(*set));
}
#if SANITIZER_LINUX
void internal_sigdelset(__sanitizer_sigset_t *set, int signum) {
signum -= 1;
CHECK_GE(signum, 0);
CHECK_LT(signum, sizeof(*set) * 8);
__sanitizer_kernel_sigset_t *k_set = (__sanitizer_kernel_sigset_t *)set;
const uptr idx = signum / (sizeof(k_set->sig[0]) * 8);
const uptr bit = signum % (sizeof(k_set->sig[0]) * 8);
k_set->sig[idx] &= ~(1 << bit);
}
bool internal_sigismember(__sanitizer_sigset_t *set, int signum) {
signum -= 1;
CHECK_GE(signum, 0);
CHECK_LT(signum, sizeof(*set) * 8);
__sanitizer_kernel_sigset_t *k_set = (__sanitizer_kernel_sigset_t *)set;
const uptr idx = signum / (sizeof(k_set->sig[0]) * 8);
const uptr bit = signum % (sizeof(k_set->sig[0]) * 8);
return k_set->sig[idx] & (1 << bit);
}
#elif SANITIZER_FREEBSD
void internal_sigdelset(__sanitizer_sigset_t *set, int signum) {
sigset_t *rset = reinterpret_cast<sigset_t *>(set);
sigdelset(rset, signum);
}
bool internal_sigismember(__sanitizer_sigset_t *set, int signum) {
sigset_t *rset = reinterpret_cast<sigset_t *>(set);
return sigismember(rset, signum);
}
#endif
#endif // !SANITIZER_SOLARIS
#if !SANITIZER_NETBSD
// ThreadLister implementation.
ThreadLister::ThreadLister(pid_t pid) : pid_(pid), buffer_(4096) {
char task_directory_path[80];
internal_snprintf(task_directory_path, sizeof(task_directory_path),
"/proc/%d/task/", pid);
descriptor_ = internal_open(task_directory_path, O_RDONLY | O_DIRECTORY);
if (internal_iserror(descriptor_)) {
Report("Can't open /proc/%d/task for reading.\n", pid);
}
}
ThreadLister::Result ThreadLister::ListThreads(
InternalMmapVector<tid_t> *threads) {
if (internal_iserror(descriptor_))
return Error;
internal_lseek(descriptor_, 0, SEEK_SET);
threads->clear();
Result result = Ok;
for (bool first_read = true;; first_read = false) {
// Resize to max capacity if it was downsized by IsAlive.
buffer_.resize(buffer_.capacity());
CHECK_GE(buffer_.size(), 4096);
uptr read = internal_getdents(
descriptor_, (struct linux_dirent *)buffer_.data(), buffer_.size());
if (!read)
return result;
if (internal_iserror(read)) {
Report("Can't read directory entries from /proc/%d/task.\n", pid_);
return Error;
}
for (uptr begin = (uptr)buffer_.data(), end = begin + read; begin < end;) {
struct linux_dirent *entry = (struct linux_dirent *)begin;
begin += entry->d_reclen;
if (entry->d_ino == 1) {
// Inode 1 is for bad blocks and can also be a reason for an early return.
// It may be emitted if the kernel tried to output a terminating thread.
// See proc_task_readdir implementation in Linux.
result = Incomplete;
}
if (entry->d_ino && *entry->d_name >= '0' && *entry->d_name <= '9')
threads->push_back(internal_atoll(entry->d_name));
}
// Now detect a short read or an early EOF. In such cases Linux can return an
// inconsistent list that is missing alive threads.
// The code just remembers that the list may be incomplete, but it keeps
// reading to return as much as possible.
if (!first_read) {
// The first one was a short-read by definition.
result = Incomplete;
} else if (read > buffer_.size() - 1024) {
// Read was close to the buffer size. So double the size and assume the
// worst.
buffer_.resize(buffer_.size() * 2);
result = Incomplete;
} else if (!threads->empty() && !IsAlive(threads->back())) {
// Maybe Linux early returned from read on terminated thread (!pid_alive)
// and failed to restore read position.
// See next_tid and proc_task_instantiate in Linux.
result = Incomplete;
}
}
}
bool ThreadLister::IsAlive(int tid) {
// /proc/%d/task/%d/status uses the same call to detect alive threads as
// proc_task_readdir does. See the task_state implementation in Linux.
char path[80];
internal_snprintf(path, sizeof(path), "/proc/%d/task/%d/status", pid_, tid);
if (!ReadFileToVector(path, &buffer_) || buffer_.empty())
return false;
buffer_.push_back(0);
static const char kPrefix[] = "\nPPid:";
const char *field = internal_strstr(buffer_.data(), kPrefix);
if (!field)
return false;
field += internal_strlen(kPrefix);
return (int)internal_atoll(field) != 0;
}
ThreadLister::~ThreadLister() {
if (!internal_iserror(descriptor_))
internal_close(descriptor_);
}
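// Usage sketch (illustrative only): ThreadLister enumerates the live threads
// of a process by walking /proc/<pid>/task; callers typically tolerate or
// retry on Incomplete results, as explained in ListThreads() above.
// CountThreadsSketch is a hypothetical helper for this example.
static inline uptr CountThreadsSketch(pid_t pid) {
  ThreadLister lister(pid);
  InternalMmapVector<tid_t> tids;
  if (lister.ListThreads(&tids) == ThreadLister::Error)
    return 0;
  return tids.size();
}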
#endif
#if SANITIZER_WORDSIZE == 32
// Take care of unusable kernel area in top gigabyte.
static uptr GetKernelAreaSize() {
#if SANITIZER_LINUX && !SANITIZER_X32
const uptr gbyte = 1UL << 30;
// Firstly check if there are writable segments
// mapped to top gigabyte (e.g. stack).
MemoryMappingLayout proc_maps(/*cache_enabled*/true);
if (proc_maps.Error())
return 0;
MemoryMappedSegment segment;
while (proc_maps.Next(&segment)) {
if ((segment.end >= 3 * gbyte) && segment.IsWritable()) return 0;
}
#if !SANITIZER_ANDROID
// Even if nothing is mapped, top Gb may still be accessible
// if we are running on 64-bit kernel.
// Uname may report misleading results if personality type
// is modified (e.g. under schroot) so check this as well.
struct utsname uname_info;
int pers = personality(0xffffffffUL);
if (!(pers & PER_MASK)
&& uname(&uname_info) == 0
&& internal_strstr(uname_info.machine, "64"))
return 0;
#endif // !SANITIZER_ANDROID
// Top gigabyte is reserved for kernel.
return gbyte;
#else
return 0;
#endif // SANITIZER_LINUX && !SANITIZER_X32
}
#endif // SANITIZER_WORDSIZE == 32
uptr GetMaxVirtualAddress() {
#if (SANITIZER_NETBSD || SANITIZER_OPENBSD) && defined(__x86_64__)
return 0x7f7ffffff000ULL; // (0x00007f8000000000 - PAGE_SIZE)
#elif SANITIZER_WORDSIZE == 64
# if defined(__powerpc64__) || defined(__aarch64__)
// On PowerPC64 we have two different address space layouts: 44- and 46-bit.
// We somehow need to figure out which one we are using now and choose
// one of 0x00000fffffffffffUL and 0x00003fffffffffffUL.
// Note that with 'ulimit -s unlimited' the stack is moved away from the top
// of the address space, so simply checking the stack address is not enough.
// This should (does) work for both PowerPC64 Endian modes.
// Similarly, aarch64 has multiple address space layouts: 39, 42 and 47-bit.
return (1ULL << (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1)) - 1;
# elif defined(__mips64)
return (1ULL << 40) - 1; // 0x000000ffffffffffUL;
# elif defined(__s390x__)
return (1ULL << 53) - 1; // 0x001fffffffffffffUL;
#elif defined(__sparc__)
return ~(uptr)0;
# else
return (1ULL << 47) - 1; // 0x00007fffffffffffUL;
# endif
#else // SANITIZER_WORDSIZE == 32
# if defined(__s390__)
return (1ULL << 31) - 1; // 0x7fffffff;
# else
return (1ULL << 32) - 1; // 0xffffffff;
# endif
#endif // SANITIZER_WORDSIZE
}
uptr GetMaxUserVirtualAddress() {
uptr addr = GetMaxVirtualAddress();
#if SANITIZER_WORDSIZE == 32 && !defined(__s390__)
if (!common_flags()->full_address_space)
addr -= GetKernelAreaSize();
CHECK_LT(reinterpret_cast<uptr>(&addr), addr);
#endif
return addr;
}
#if !SANITIZER_ANDROID
uptr GetPageSize() {
#if SANITIZER_LINUX && (defined(__x86_64__) || defined(__i386__))
return EXEC_PAGESIZE;
#elif SANITIZER_USE_GETAUXVAL
return getauxval(AT_PAGESZ);
#elif SANITIZER_FREEBSD || SANITIZER_NETBSD
// Use sysctl as sysconf can trigger interceptors internally.
int pz = 0;
uptr pzl = sizeof(pz);
int mib[2] = {CTL_HW, HW_PAGESIZE};
int rv = internal_sysctl(mib, 2, &pz, &pzl, nullptr, 0);
CHECK_EQ(rv, 0);
return (uptr)pz;
#else
return sysconf(_SC_PAGESIZE); // EXEC_PAGESIZE may not be trustworthy.
#endif
}
#endif // !SANITIZER_ANDROID
#if !SANITIZER_OPENBSD
uptr ReadBinaryName(/*out*/char *buf, uptr buf_len) {
#if SANITIZER_SOLARIS
const char *default_module_name = getexecname();
CHECK_NE(default_module_name, NULL);
return internal_snprintf(buf, buf_len, "%s", default_module_name);
#else
#if SANITIZER_FREEBSD || SANITIZER_NETBSD
#if SANITIZER_FREEBSD
const int Mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
#else
const int Mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
#endif
const char *default_module_name = "kern.proc.pathname";
uptr Size = buf_len;
bool IsErr =
(internal_sysctl(Mib, ARRAY_SIZE(Mib), buf, &Size, NULL, 0) != 0);
int readlink_error = IsErr ? errno : 0;
uptr module_name_len = Size;
#else
const char *default_module_name = "/proc/self/exe";
uptr module_name_len = internal_readlink(
default_module_name, buf, buf_len);
int readlink_error;
bool IsErr = internal_iserror(module_name_len, &readlink_error);
#endif // SANITIZER_FREEBSD || SANITIZER_NETBSD
if (IsErr) {
// We can't read binary name for some reason, assume it's unknown.
Report("WARNING: reading executable name failed with errno %d, "
"some stack frames may not be symbolized\n", readlink_error);
module_name_len = internal_snprintf(buf, buf_len, "%s",
default_module_name);
CHECK_LT(module_name_len, buf_len);
}
return module_name_len;
#endif
}
#endif // !SANITIZER_OPENBSD
uptr ReadLongProcessName(/*out*/ char *buf, uptr buf_len) {
#if SANITIZER_LINUX
char *tmpbuf;
uptr tmpsize;
uptr tmplen;
if (ReadFileToBuffer("/proc/self/cmdline", &tmpbuf, &tmpsize, &tmplen,
1024 * 1024)) {
internal_strncpy(buf, tmpbuf, buf_len);
UnmapOrDie(tmpbuf, tmpsize);
return internal_strlen(buf);
}
#endif
return ReadBinaryName(buf, buf_len);
}
// Match full names of the form /path/to/base_name{-,.}*
bool LibraryNameIs(const char *full_name, const char *base_name) {
const char *name = full_name;
// Strip path.
while (*name != '\0') name++;
while (name > full_name && *name != '/') name--;
if (*name == '/') name++;
uptr base_name_length = internal_strlen(base_name);
if (internal_strncmp(name, base_name, base_name_length)) return false;
return (name[base_name_length] == '-' || name[base_name_length] == '.');
}
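// Usage sketch (illustrative only): LibraryNameIs() strips the directory part
// and then requires the base name to be followed by '-' or '.', so a path
// like "/usr/lib/libfoo-1.2.so" matches base name "libfoo" while
// "/usr/lib/libfoobar.so" does not. "libfoo" is a made-up name.
static inline bool IsFooLibrarySketch(const char *full_name) {
  return LibraryNameIs(full_name, "libfoo");
}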
#if !SANITIZER_ANDROID
// Call cb for each region mapped by map.
void ForEachMappedRegion(link_map *map, void (*cb)(const void *, uptr)) {
CHECK_NE(map, nullptr);
#if !SANITIZER_FREEBSD && !SANITIZER_OPENBSD
typedef ElfW(Phdr) Elf_Phdr;
typedef ElfW(Ehdr) Elf_Ehdr;
#endif // !SANITIZER_FREEBSD && !SANITIZER_OPENBSD
char *base = (char *)map->l_addr;
Elf_Ehdr *ehdr = (Elf_Ehdr *)base;
char *phdrs = base + ehdr->e_phoff;
char *phdrs_end = phdrs + ehdr->e_phnum * ehdr->e_phentsize;
// Find the segment with the minimum base so we can "relocate" the p_vaddr
// fields. Typically ET_DYN objects (DSOs) have base of zero and ET_EXEC
// objects have a non-zero base.
uptr preferred_base = (uptr)-1;
for (char *iter = phdrs; iter != phdrs_end; iter += ehdr->e_phentsize) {
Elf_Phdr *phdr = (Elf_Phdr *)iter;
if (phdr->p_type == PT_LOAD && preferred_base > (uptr)phdr->p_vaddr)
preferred_base = (uptr)phdr->p_vaddr;
}
// Compute the delta from the real base to get a relocation delta.
sptr delta = (uptr)base - preferred_base;
// Now we can figure out what the loader really mapped.
for (char *iter = phdrs; iter != phdrs_end; iter += ehdr->e_phentsize) {
Elf_Phdr *phdr = (Elf_Phdr *)iter;
if (phdr->p_type == PT_LOAD) {
uptr seg_start = phdr->p_vaddr + delta;
uptr seg_end = seg_start + phdr->p_memsz;
// None of these values are aligned. We consider the ragged edges of the
// load command as defined, since they are mapped from the file.
seg_start = RoundDownTo(seg_start, GetPageSizeCached());
seg_end = RoundUpTo(seg_end, GetPageSizeCached());
cb((void *)seg_start, seg_end - seg_start);
}
}
}
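// Usage sketch (illustrative only): ForEachMappedRegion() reports every
// page-rounded PT_LOAD segment of a loaded object to the callback, which lets
// a caller scan or poison everything a DSO maps. PrintRegionSketch is a
// hypothetical callback for this example.
static void PrintRegionSketch(const void *start, uptr size) {
  Printf("segment at %p, %zu bytes\n", start, size);
}
// For a given link_map *map (e.g. from dlopen/dlinfo):
//   ForEachMappedRegion(map, PrintRegionSketch);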
#endif
#if defined(__x86_64__) && SANITIZER_LINUX
// We cannot use glibc's clone wrapper, because it messes with the child
// task's TLS. It writes the PID and TID of the child task to its thread
// descriptor, but in our case the child task shares the thread descriptor with
// the parent (because we don't know how to allocate a new thread
// descriptor to keep glibc happy). So the stock version of clone(), when
// used with CLONE_VM, would end up corrupting the parent's thread descriptor.
uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
int *parent_tidptr, void *newtls, int *child_tidptr) {
long long res;
if (!fn || !child_stack)
return -EINVAL;
CHECK_EQ(0, (uptr)child_stack % 16);
child_stack = (char *)child_stack - 2 * sizeof(unsigned long long);
((unsigned long long *)child_stack)[0] = (uptr)fn;
((unsigned long long *)child_stack)[1] = (uptr)arg;
register void *r8 __asm__("r8") = newtls;
register int *r10 __asm__("r10") = child_tidptr;
__asm__ __volatile__(
/* %rax = syscall(%rax = SYSCALL(clone),
* %rdi = flags,
* %rsi = child_stack,
* %rdx = parent_tidptr,
* %r8 = new_tls,
* %r10 = child_tidptr)
*/
"syscall\n"
/* if (%rax != 0)
* return;
*/
"testq %%rax,%%rax\n"
"jnz 1f\n"
/* In the child. Terminate unwind chain. */
// XXX: We should also terminate the CFI unwind chain
// here. Unfortunately clang 3.2 doesn't support the
// necessary CFI directives, so we skip that part.
"xorq %%rbp,%%rbp\n"
/* Call "fn(arg)". */
"popq %%rax\n"
"popq %%rdi\n"
"call *%%rax\n"
/* Call _exit(%rax). */
"movq %%rax,%%rdi\n"
"movq %2,%%rax\n"
"syscall\n"
/* Return to parent. */
"1:\n"
: "=a" (res)
: "a"(SYSCALL(clone)), "i"(SYSCALL(exit)),
"S"(child_stack),
"D"(flags),
"d"(parent_tidptr),
"r"(r8),
"r"(r10)
: "memory", "r11", "rcx");
return res;
}
#elif defined(__mips__)
uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
int *parent_tidptr, void *newtls, int *child_tidptr) {
long long res;
if (!fn || !child_stack)
return -EINVAL;
CHECK_EQ(0, (uptr)child_stack % 16);
child_stack = (char *)child_stack - 2 * sizeof(unsigned long long);
((unsigned long long *)child_stack)[0] = (uptr)fn;
((unsigned long long *)child_stack)[1] = (uptr)arg;
register void *a3 __asm__("$7") = newtls;
register int *a4 __asm__("$8") = child_tidptr;
// We don't have proper CFI directives here because that would require a lot
// of code for very marginal benefit.
__asm__ __volatile__(
/* $v0 = syscall($v0 = __NR_clone,
* $a0 = flags,
* $a1 = child_stack,
* $a2 = parent_tidptr,
* $a3 = new_tls,
* $a4 = child_tidptr)
*/
".cprestore 16;\n"
"move $4,%1;\n"
"move $5,%2;\n"
"move $6,%3;\n"
"move $7,%4;\n"
/* Store the fifth argument on stack
* if we are using 32-bit abi.
*/
#if SANITIZER_WORDSIZE == 32
"lw %5,16($29);\n"
#else
"move $8,%5;\n"
#endif
"li $2,%6;\n"
"syscall;\n"
/* if ($v0 != 0)
* return;
*/
"bnez $2,1f;\n"
/* Call "fn(arg)". */
#if SANITIZER_WORDSIZE == 32
#ifdef __BIG_ENDIAN__
"lw $25,4($29);\n"
"lw $4,12($29);\n"
#else
"lw $25,0($29);\n"
"lw $4,8($29);\n"
#endif
#else
"ld $25,0($29);\n"
"ld $4,8($29);\n"
#endif
"jal $25;\n"
/* Call _exit($v0). */
"move $4,$2;\n"
"li $2,%7;\n"
"syscall;\n"
/* Return to parent. */
"1:\n"
: "=r" (res)
: "r"(flags),
"r"(child_stack),
"r"(parent_tidptr),
"r"(a3),
"r"(a4),
"i"(__NR_clone),
"i"(__NR_exit)
: "memory", "$29" );
return res;
}
#elif defined(__aarch64__)
uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
int *parent_tidptr, void *newtls, int *child_tidptr) {
long long res;
if (!fn || !child_stack)
return -EINVAL;
CHECK_EQ(0, (uptr)child_stack % 16);
child_stack = (char *)child_stack - 2 * sizeof(unsigned long long);
((unsigned long long *)child_stack)[0] = (uptr)fn;
((unsigned long long *)child_stack)[1] = (uptr)arg;
register int (*__fn)(void *) __asm__("x0") = fn;
register void *__stack __asm__("x1") = child_stack;
register int __flags __asm__("x2") = flags;
register void *__arg __asm__("x3") = arg;
register int *__ptid __asm__("x4") = parent_tidptr;
register void *__tls __asm__("x5") = newtls;
register int *__ctid __asm__("x6") = child_tidptr;
__asm__ __volatile__(
"mov x0,x2\n" /* flags */
"mov x2,x4\n" /* ptid */
"mov x3,x5\n" /* tls */
"mov x4,x6\n" /* ctid */
"mov x8,%9\n" /* clone */
"svc 0x0\n"
/* if (%r0 != 0)
* return %r0;
*/
"cmp x0, #0\n"
"bne 1f\n"
/* In the child, now. Call "fn(arg)". */
"ldp x1, x0, [sp], #16\n"
"blr x1\n"
/* Call _exit(%r0). */
"mov x8, %10\n"
"svc 0x0\n"
"1:\n"
: "=r" (res)
: "i"(-EINVAL),
"r"(__fn), "r"(__stack), "r"(__flags), "r"(__arg),
"r"(__ptid), "r"(__tls), "r"(__ctid),
"i"(__NR_clone), "i"(__NR_exit)
: "x30", "memory");
return res;
}
#elif defined(__powerpc64__)
uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
int *parent_tidptr, void *newtls, int *child_tidptr) {
long long res;
// Stack frame structure.
#if SANITIZER_PPC64V1
// Back chain == 0 (SP + 112)
// Frame (112 bytes):
// Parameter save area (SP + 48), 8 doublewords
// TOC save area (SP + 40)
// Link editor doubleword (SP + 32)
// Compiler doubleword (SP + 24)
// LR save area (SP + 16)
// CR save area (SP + 8)
// Back chain (SP + 0)
# define FRAME_SIZE 112
# define FRAME_TOC_SAVE_OFFSET 40
#elif SANITIZER_PPC64V2
// Back chain == 0 (SP + 32)
// Frame (32 bytes):
// TOC save area (SP + 24)
// LR save area (SP + 16)
// CR save area (SP + 8)
// Back chain (SP + 0)
# define FRAME_SIZE 32
# define FRAME_TOC_SAVE_OFFSET 24
#else
# error "Unsupported PPC64 ABI"
#endif
if (!fn || !child_stack)
return -EINVAL;
CHECK_EQ(0, (uptr)child_stack % 16);
register int (*__fn)(void *) __asm__("r3") = fn;
register void *__cstack __asm__("r4") = child_stack;
register int __flags __asm__("r5") = flags;
register void *__arg __asm__("r6") = arg;
register int *__ptidptr __asm__("r7") = parent_tidptr;
register void *__newtls __asm__("r8") = newtls;
register int *__ctidptr __asm__("r9") = child_tidptr;
__asm__ __volatile__(
/* fn and arg are saved across the syscall */
"mr 28, %5\n\t"
"mr 27, %8\n\t"
/* syscall
r0 == __NR_clone
r3 == flags
r4 == child_stack
r5 == parent_tidptr
r6 == newtls
r7 == child_tidptr */
"mr 3, %7\n\t"
"mr 5, %9\n\t"
"mr 6, %10\n\t"
"mr 7, %11\n\t"
"li 0, %3\n\t"
"sc\n\t"
/* Test if syscall was successful */
"cmpdi cr1, 3, 0\n\t"
"crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t"
"bne- cr1, 1f\n\t"
/* Set up stack frame */
"li 29, 0\n\t"
"stdu 29, -8(1)\n\t"
"stdu 1, -%12(1)\n\t"
/* Do the function call */
"std 2, %13(1)\n\t"
#if SANITIZER_PPC64V1
"ld 0, 0(28)\n\t"
"ld 2, 8(28)\n\t"
"mtctr 0\n\t"
#elif SANITIZER_PPC64V2
"mr 12, 28\n\t"
"mtctr 12\n\t"
#else
# error "Unsupported PPC64 ABI"
#endif
"mr 3, 27\n\t"
"bctrl\n\t"
"ld 2, %13(1)\n\t"
/* Call _exit(r3) */
"li 0, %4\n\t"
"sc\n\t"
/* Return to parent */
"1:\n\t"
"mr %0, 3\n\t"
: "=r" (res)
: "0" (-1),
"i" (EINVAL),
"i" (__NR_clone),
"i" (__NR_exit),
"r" (__fn),
"r" (__cstack),
"r" (__flags),
"r" (__arg),
"r" (__ptidptr),
"r" (__newtls),
"r" (__ctidptr),
"i" (FRAME_SIZE),
"i" (FRAME_TOC_SAVE_OFFSET)
: "cr0", "cr1", "memory", "ctr", "r0", "r27", "r28", "r29");
return res;
}
#elif defined(__i386__) && SANITIZER_LINUX
uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
int *parent_tidptr, void *newtls, int *child_tidptr) {
int res;
if (!fn || !child_stack)
return -EINVAL;
CHECK_EQ(0, (uptr)child_stack % 16);
child_stack = (char *)child_stack - 7 * sizeof(unsigned int);
((unsigned int *)child_stack)[0] = (uptr)flags;
((unsigned int *)child_stack)[1] = (uptr)0;
((unsigned int *)child_stack)[2] = (uptr)fn;
((unsigned int *)child_stack)[3] = (uptr)arg;
__asm__ __volatile__(
/* %eax = syscall(%eax = SYSCALL(clone),
* %ebx = flags,
* %ecx = child_stack,
* %edx = parent_tidptr,
* %esi = new_tls,
* %edi = child_tidptr)
*/
/* Obtain flags */
"movl (%%ecx), %%ebx\n"
/* Do the system call */
"pushl %%ebx\n"
"pushl %%esi\n"
"pushl %%edi\n"
/* Remember the flag value. */
"movl %%ebx, (%%ecx)\n"
"int $0x80\n"
"popl %%edi\n"
"popl %%esi\n"
"popl %%ebx\n"
/* if (%eax != 0)
* return;
*/
"test %%eax,%%eax\n"
"jnz 1f\n"
/* terminate the stack frame */
"xorl %%ebp,%%ebp\n"
/* Call FN. */
"call *%%ebx\n"
#ifdef PIC
"call here\n"
"here:\n"
"popl %%ebx\n"
"addl $_GLOBAL_OFFSET_TABLE_+[.-here], %%ebx\n"
#endif
/* Call exit */
"movl %%eax, %%ebx\n"
"movl %2, %%eax\n"
"int $0x80\n"
"1:\n"
: "=a" (res)
: "a"(SYSCALL(clone)), "i"(SYSCALL(exit)),
"c"(child_stack),
"d"(parent_tidptr),
"S"(newtls),
"D"(child_tidptr)
: "memory");
return res;
}
#elif defined(__arm__) && SANITIZER_LINUX
uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
int *parent_tidptr, void *newtls, int *child_tidptr) {
unsigned int res;
if (!fn || !child_stack)
return -EINVAL;
child_stack = (char *)child_stack - 2 * sizeof(unsigned int);
((unsigned int *)child_stack)[0] = (uptr)fn;
((unsigned int *)child_stack)[1] = (uptr)arg;
register int r0 __asm__("r0") = flags;
register void *r1 __asm__("r1") = child_stack;
register int *r2 __asm__("r2") = parent_tidptr;
register void *r3 __asm__("r3") = newtls;
register int *r4 __asm__("r4") = child_tidptr;
register int r7 __asm__("r7") = __NR_clone;
#if __ARM_ARCH > 4 || defined (__ARM_ARCH_4T__)
# define ARCH_HAS_BX
#endif
#if __ARM_ARCH > 4
# define ARCH_HAS_BLX
#endif
#ifdef ARCH_HAS_BX
# ifdef ARCH_HAS_BLX
# define BLX(R) "blx " #R "\n"
# else
# define BLX(R) "mov lr, pc; bx " #R "\n"
# endif
#else
# define BLX(R) "mov lr, pc; mov pc," #R "\n"
#endif
__asm__ __volatile__(
/* %r0 = syscall(%r7 = SYSCALL(clone),
* %r0 = flags,
* %r1 = child_stack,
* %r2 = parent_tidptr,
* %r3 = new_tls,
* %r4 = child_tidptr)
*/
/* Do the system call */
"swi 0x0\n"
/* if (%r0 != 0)
* return %r0;
*/
"cmp r0, #0\n"
"bne 1f\n"
/* In the child, now. Call "fn(arg)". */
"ldr r0, [sp, #4]\n"
"ldr ip, [sp], #8\n"
BLX(ip)
/* Call _exit(%r0). */
"mov r7, %7\n"
"swi 0x0\n"
"1:\n"
"mov %0, r0\n"
: "=r"(res)
: "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r7),
"i"(__NR_exit)
: "memory");
return res;
}
#endif // defined(__x86_64__) && SANITIZER_LINUX
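#if defined(__x86_64__) && SANITIZER_LINUX
// Usage sketch (illustrative only): internal_clone() expects a 16-byte
// aligned top-of-stack and runs fn(arg) in the child; with CLONE_VM the child
// shares the parent's address space, which is exactly the case the glibc
// wrapper mishandles (see the comment above the x86_64 implementation).
// TracerThreadSketch and SpawnTracerSketch are hypothetical names.
static int TracerThreadSketch(void *arg) {
  (void)arg;
  return 0;  // The child exits via the inlined exit syscall in internal_clone.
}
static inline uptr SpawnTracerSketch(void *stack_base, uptr stack_size) {
  // The caller must ensure stack_base + stack_size is 16-byte aligned.
  void *stack_top = (char *)stack_base + stack_size;
  return internal_clone(TracerThreadSketch, stack_top,
                        /*flags=*/SIGCHLD,  // real callers also OR in CLONE_* flags
                        /*arg=*/nullptr, /*parent_tidptr=*/nullptr,
                        /*newtls=*/nullptr, /*child_tidptr=*/nullptr);
}
#endif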
#if SANITIZER_ANDROID
#if __ANDROID_API__ < 21
extern "C" __attribute__((weak)) int dl_iterate_phdr(
int (*)(struct dl_phdr_info *, size_t, void *), void *);
#endif
static int dl_iterate_phdr_test_cb(struct dl_phdr_info *info, size_t size,
void *data) {
// Any name starting with "lib" indicates a bug in L where library base names
// are returned instead of paths.
if (info->dlpi_name && info->dlpi_name[0] == 'l' &&
info->dlpi_name[1] == 'i' && info->dlpi_name[2] == 'b') {
*(bool *)data = true;
return 1;
}
return 0;
}
static atomic_uint32_t android_api_level;
static AndroidApiLevel AndroidDetectApiLevelStatic() {
#if __ANDROID_API__ <= 19
return ANDROID_KITKAT;
#elif __ANDROID_API__ <= 22
return ANDROID_LOLLIPOP_MR1;
#else
return ANDROID_POST_LOLLIPOP;
#endif
}
static AndroidApiLevel AndroidDetectApiLevel() {
if (!&dl_iterate_phdr)
return ANDROID_KITKAT; // K or lower
bool base_name_seen = false;
dl_iterate_phdr(dl_iterate_phdr_test_cb, &base_name_seen);
if (base_name_seen)
return ANDROID_LOLLIPOP_MR1; // L MR1
return ANDROID_POST_LOLLIPOP; // post-L
// Plain L (API level 21) is completely broken wrt ASan and not very
// interesting to detect.
}
extern "C" __attribute__((weak)) void* _DYNAMIC;
AndroidApiLevel AndroidGetApiLevel() {
AndroidApiLevel level =
(AndroidApiLevel)atomic_load(&android_api_level, memory_order_relaxed);
if (level) return level;
level = &_DYNAMIC == nullptr ? AndroidDetectApiLevelStatic()
: AndroidDetectApiLevel();
atomic_store(&android_api_level, level, memory_order_relaxed);
return level;
}
#endif
static HandleSignalMode GetHandleSignalModeImpl(int signum) {
switch (signum) {
case SIGABRT:
return common_flags()->handle_abort;
case SIGILL:
return common_flags()->handle_sigill;
case SIGTRAP:
return common_flags()->handle_sigtrap;
case SIGFPE:
return common_flags()->handle_sigfpe;
case SIGSEGV:
return common_flags()->handle_segv;
case SIGBUS:
return common_flags()->handle_sigbus;
}
return kHandleSignalNo;
}
HandleSignalMode GetHandleSignalMode(int signum) {
HandleSignalMode result = GetHandleSignalModeImpl(signum);
if (result == kHandleSignalYes && !common_flags()->allow_user_segv_handler)
return kHandleSignalExclusive;
return result;
}
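// Usage sketch (illustrative only): a signal-installation path consults
// GetHandleSignalMode() to decide whether to install a handler at all and
// whether a later user handler may replace it. MaybeInstallHandlerSketch is a
// hypothetical helper for this example.
static inline bool MaybeInstallHandlerSketch(int signum) {
  HandleSignalMode mode = GetHandleSignalMode(signum);
  if (mode == kHandleSignalNo) return false;  // Leave the signal alone.
  // kHandleSignalExclusive means a user handler must not override ours.
  return true;
}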
#if !SANITIZER_GO
void *internal_start_thread(void(*func)(void *arg), void *arg) {
// Start the thread with signals blocked, otherwise it can steal user signals.
__sanitizer_sigset_t set, old;
internal_sigfillset(&set);
#if SANITIZER_LINUX && !SANITIZER_ANDROID
// Glibc uses the SIGSETXID signal during setuid calls. If this signal is
// blocked on any thread, the setuid call hangs (see test/tsan/setuid.c).
internal_sigdelset(&set, 33);
#endif
internal_sigprocmask(SIG_SETMASK, &set, &old);
void *th;
real_pthread_create(&th, nullptr, (void*(*)(void *arg))func, arg);
internal_sigprocmask(SIG_SETMASK, &old, nullptr);
return th;
}
void internal_join_thread(void *th) {
real_pthread_join(th, nullptr);
}
#else
void *internal_start_thread(void (*func)(void *), void *arg) { return 0; }
void internal_join_thread(void *th) {}
#endif
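// Usage sketch (illustrative only): internal_start_thread() blocks all
// signals around pthread_create so the new thread cannot steal user signals;
// the returned handle is joined with internal_join_thread().
// BackgroundWorkerSketch is a hypothetical worker for this example.
static void BackgroundWorkerSketch(void *arg) { (void)arg; }
static inline void RunWorkerOnceSketch() {
  void *th = internal_start_thread(BackgroundWorkerSketch, nullptr);
  internal_join_thread(th);
}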
#if defined(__aarch64__)
// Android headers in the older NDK releases miss this definition.
struct __sanitizer_esr_context {
struct _aarch64_ctx head;
uint64_t esr;
};
static bool Aarch64GetESR(ucontext_t *ucontext, u64 *esr) {
static const u32 kEsrMagic = 0x45535201;
u8 *aux = ucontext->uc_mcontext.__reserved;
while (true) {
_aarch64_ctx *ctx = (_aarch64_ctx *)aux;
if (ctx->size == 0) break;
if (ctx->magic == kEsrMagic) {
*esr = ((__sanitizer_esr_context *)ctx)->esr;
return true;
}
aux += ctx->size;
}
return false;
}
#endif
#if SANITIZER_OPENBSD
using Context = sigcontext;
#else
using Context = ucontext_t;
#endif
SignalContext::WriteFlag SignalContext::GetWriteFlag() const {
Context *ucontext = (Context *)context;
#if defined(__x86_64__) || defined(__i386__)
static const uptr PF_WRITE = 1U << 1;
#if SANITIZER_FREEBSD
uptr err = ucontext->uc_mcontext.mc_err;
#elif SANITIZER_NETBSD
uptr err = ucontext->uc_mcontext.__gregs[_REG_ERR];
#elif SANITIZER_OPENBSD
uptr err = ucontext->sc_err;
#elif SANITIZER_SOLARIS && defined(__i386__)
const int Err = 13;
uptr err = ucontext->uc_mcontext.gregs[Err];
#else
uptr err = ucontext->uc_mcontext.gregs[REG_ERR];
#endif // SANITIZER_FREEBSD
return err & PF_WRITE ? WRITE : READ;
#elif defined(__mips__)
uint32_t *exception_source;
uint32_t faulty_instruction;
uint32_t op_code;
exception_source = (uint32_t *)ucontext->uc_mcontext.pc;
faulty_instruction = (uint32_t)(*exception_source);
op_code = (faulty_instruction >> 26) & 0x3f;
// FIXME: Add support for FPU, microMIPS, DSP, MSA memory instructions.
switch (op_code) {
case 0x28: // sb
case 0x29: // sh
case 0x2b: // sw
case 0x3f: // sd
#if __mips_isa_rev < 6
case 0x2c: // sdl
case 0x2d: // sdr
case 0x2a: // swl
case 0x2e: // swr
#endif
return SignalContext::WRITE;
case 0x20: // lb
case 0x24: // lbu
case 0x21: // lh
case 0x25: // lhu
case 0x23: // lw
case 0x27: // lwu
case 0x37: // ld
#if __mips_isa_rev < 6
case 0x1a: // ldl
case 0x1b: // ldr
case 0x22: // lwl
case 0x26: // lwr
#endif
return SignalContext::READ;
#if __mips_isa_rev == 6
case 0x3b: // pcrel
op_code = (faulty_instruction >> 19) & 0x3;
switch (op_code) {
case 0x1: // lwpc
case 0x2: // lwupc
return SignalContext::READ;
}
#endif
}
return SignalContext::UNKNOWN;
#elif defined(__arm__)
static const uptr FSR_WRITE = 1U << 11;
uptr fsr = ucontext->uc_mcontext.error_code;
return fsr & FSR_WRITE ? WRITE : READ;
#elif defined(__aarch64__)
static const u64 ESR_ELx_WNR = 1U << 6;
u64 esr;
if (!Aarch64GetESR(ucontext, &esr)) return UNKNOWN;
return esr & ESR_ELx_WNR ? WRITE : READ;
#elif defined(__sparc__)
// Decode the instruction to determine the access type.
// From OpenSolaris $SRC/uts/sun4/os/trap.c (get_accesstype).
#if SANITIZER_SOLARIS
uptr pc = ucontext->uc_mcontext.gregs[REG_PC];
#else
// Historical BSDism here.
struct sigcontext *scontext = (struct sigcontext *)context;
#if defined(__arch64__)
uptr pc = scontext->sigc_regs.tpc;
#else
uptr pc = scontext->si_regs.pc;
#endif
#endif
u32 instr = *(u32 *)pc;
return (instr >> 21) & 1 ? WRITE: READ;
#else
(void)ucontext;
return UNKNOWN; // FIXME: Implement.
#endif
}
void SignalContext::DumpAllRegisters(void *context) {
// FIXME: Implement this.
}
static void GetPcSpBp(void *context, uptr *pc, uptr *sp, uptr *bp) {
#if SANITIZER_NETBSD
// This covers all NetBSD architectures
ucontext_t *ucontext = (ucontext_t *)context;
*pc = _UC_MACHINE_PC(ucontext);
*bp = _UC_MACHINE_FP(ucontext);
*sp = _UC_MACHINE_SP(ucontext);
#elif defined(__arm__)
ucontext_t *ucontext = (ucontext_t*)context;
*pc = ucontext->uc_mcontext.arm_pc;
*bp = ucontext->uc_mcontext.arm_fp;
*sp = ucontext->uc_mcontext.arm_sp;
#elif defined(__aarch64__)
ucontext_t *ucontext = (ucontext_t*)context;
*pc = ucontext->uc_mcontext.pc;
*bp = ucontext->uc_mcontext.regs[29];
*sp = ucontext->uc_mcontext.sp;
#elif defined(__hppa__)
ucontext_t *ucontext = (ucontext_t*)context;
*pc = ucontext->uc_mcontext.sc_iaoq[0];
/* GCC uses %r3 whenever a frame pointer is needed. */
*bp = ucontext->uc_mcontext.sc_gr[3];
*sp = ucontext->uc_mcontext.sc_gr[30];
#elif defined(__x86_64__)
# if SANITIZER_FREEBSD
ucontext_t *ucontext = (ucontext_t*)context;
*pc = ucontext->uc_mcontext.mc_rip;
*bp = ucontext->uc_mcontext.mc_rbp;
*sp = ucontext->uc_mcontext.mc_rsp;
#elif SANITIZER_OPENBSD
sigcontext *ucontext = (sigcontext *)context;
*pc = ucontext->sc_rip;
*bp = ucontext->sc_rbp;
*sp = ucontext->sc_rsp;
# else
ucontext_t *ucontext = (ucontext_t*)context;
*pc = ucontext->uc_mcontext.gregs[REG_RIP];
*bp = ucontext->uc_mcontext.gregs[REG_RBP];
*sp = ucontext->uc_mcontext.gregs[REG_RSP];
# endif
#elif defined(__i386__)
# if SANITIZER_FREEBSD
ucontext_t *ucontext = (ucontext_t*)context;
*pc = ucontext->uc_mcontext.mc_eip;
*bp = ucontext->uc_mcontext.mc_ebp;
*sp = ucontext->uc_mcontext.mc_esp;
#elif SANITIZER_OPENBSD
sigcontext *ucontext = (sigcontext *)context;
*pc = ucontext->sc_eip;
*bp = ucontext->sc_ebp;
*sp = ucontext->sc_esp;
# else
ucontext_t *ucontext = (ucontext_t*)context;
# if SANITIZER_SOLARIS
/* Use the numeric values: the symbolic ones are undefined by llvm
include/llvm/Support/Solaris.h. */
# ifndef REG_EIP
# define REG_EIP 14 // REG_PC
# endif
# ifndef REG_EBP
# define REG_EBP 6 // REG_FP
# endif
# ifndef REG_ESP
# define REG_ESP 17 // REG_SP
# endif
# endif
*pc = ucontext->uc_mcontext.gregs[REG_EIP];
*bp = ucontext->uc_mcontext.gregs[REG_EBP];
*sp = ucontext->uc_mcontext.gregs[REG_ESP];
# endif
#elif defined(__powerpc__) || defined(__powerpc64__)
ucontext_t *ucontext = (ucontext_t*)context;
*pc = ucontext->uc_mcontext.regs->nip;
*sp = ucontext->uc_mcontext.regs->gpr[PT_R1];
// The powerpc{,64}-linux ABIs do not specify r31 as the frame
// pointer, but GCC always uses r31 when we need a frame pointer.
*bp = ucontext->uc_mcontext.regs->gpr[PT_R31];
#elif defined(__sparc__)
#if defined(__arch64__) || defined(__sparcv9)
#define STACK_BIAS 2047
#else
#define STACK_BIAS 0
# endif
# if SANITIZER_SOLARIS
ucontext_t *ucontext = (ucontext_t *)context;
*pc = ucontext->uc_mcontext.gregs[REG_PC];
*sp = ucontext->uc_mcontext.gregs[REG_O6] + STACK_BIAS;
#else
// Historical BSDism here.
struct sigcontext *scontext = (struct sigcontext *)context;
#if defined(__arch64__)
*pc = scontext->sigc_regs.tpc;
*sp = scontext->sigc_regs.u_regs[14] + STACK_BIAS;
#else
*pc = scontext->si_regs.pc;
*sp = scontext->si_regs.u_regs[14];
#endif
# endif
*bp = (uptr)((uhwptr *)*sp)[14] + STACK_BIAS;
#elif defined(__mips__)
ucontext_t *ucontext = (ucontext_t*)context;
*pc = ucontext->uc_mcontext.pc;
*bp = ucontext->uc_mcontext.gregs[30];
*sp = ucontext->uc_mcontext.gregs[29];
#elif defined(__s390__)
ucontext_t *ucontext = (ucontext_t*)context;
# if defined(__s390x__)
*pc = ucontext->uc_mcontext.psw.addr;
# else
*pc = ucontext->uc_mcontext.psw.addr & 0x7fffffff;
# endif
*bp = ucontext->uc_mcontext.gregs[11];
*sp = ucontext->uc_mcontext.gregs[15];
#else
# error "Unsupported arch"
#endif
}
void SignalContext::InitPcSpBp() { GetPcSpBp(context, &pc, &sp, &bp); }
void InitializePlatformEarly() {
// Do nothing.
}
void MaybeReexec() {
// No need to re-exec on Linux.
}
void CheckASLR() {
#if SANITIZER_NETBSD
int mib[3];
int paxflags;
uptr len = sizeof(paxflags);
mib[0] = CTL_PROC;
mib[1] = internal_getpid();
mib[2] = PROC_PID_PAXFLAGS;
if (UNLIKELY(internal_sysctl(mib, 3, &paxflags, &len, NULL, 0) == -1)) {
Printf("sysctl failed\n");
Die();
}
if (UNLIKELY(paxflags & CTL_PROC_PAXFLAGS_ASLR)) {
Printf("This sanitizer is not compatible with enabled ASLR\n");
Die();
}
#elif SANITIZER_PPC64V2
// Disable ASLR for Linux PPC64LE.
int old_personality = personality(0xffffffff);
if (old_personality != -1 && (old_personality & ADDR_NO_RANDOMIZE) == 0) {
VReport(1, "WARNING: Program is being run with address space layout "
"randomization (ASLR) enabled which prevents the thread and "
"memory sanitizers from working on powerpc64le.\n"
"ASLR will be disabled and the program re-executed.\n");
CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1);
ReExec();
}
+#elif SANITIZER_FREEBSD
+ int aslr_pie;
+ uptr len = sizeof(aslr_pie);
+#if SANITIZER_WORDSIZE == 64
+ if (UNLIKELY(internal_sysctlbyname("kern.elf64.aslr.pie_enable",
+ &aslr_pie, &len, NULL, 0) == -1)) {
+    // Be less 'dramatic' here: depending on the FreeBSD release, this
+    // sysctl OID is not guaranteed to exist yet, so just skip the check.
+ return;
+ }
+
+ if (aslr_pie > 0) {
+ Printf("This sanitizer is not compatible with enabled ASLR "
+ "and binaries compiled with PIE\n");
+ Die();
+ }
+#endif
+  // A 64-bit kernel can also run 32-bit compat binaries, so check the
+  // 32-bit OID as well.
+ if (UNLIKELY(internal_sysctlbyname("kern.elf32.aslr.pie_enable",
+ &aslr_pie, &len, NULL, 0) == -1)) {
+ return;
+ }
+
+ if (aslr_pie > 0) {
+ Printf("This sanitizer is not compatible with enabled ASLR "
+ "and binaries compiled with PIE\n");
+ Die();
+ }
#else
// Do nothing
#endif
}
void CheckMPROTECT() {
#if SANITIZER_NETBSD
int mib[3];
int paxflags;
uptr len = sizeof(paxflags);
mib[0] = CTL_PROC;
mib[1] = internal_getpid();
mib[2] = PROC_PID_PAXFLAGS;
if (UNLIKELY(internal_sysctl(mib, 3, &paxflags, &len, NULL, 0) == -1)) {
Printf("sysctl failed\n");
Die();
}
if (UNLIKELY(paxflags & CTL_PROC_PAXFLAGS_MPROTECT)) {
Printf("This sanitizer is not compatible with enabled MPROTECT\n");
Die();
}
#else
// Do nothing
#endif
}
void PrintModuleMap() { }
void CheckNoDeepBind(const char *filename, int flag) {
#ifdef RTLD_DEEPBIND
if (flag & RTLD_DEEPBIND) {
Report(
"You are trying to dlopen a %s shared library with RTLD_DEEPBIND flag"
" which is incompatibe with sanitizer runtime "
"(see https://github.com/google/sanitizers/issues/611 for details"
"). If you want to run %s library under sanitizers please remove "
"RTLD_DEEPBIND from dlopen flags.\n",
filename, filename);
Die();
}
#endif
}
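// Usage sketch (illustrative only): the dlopen interceptor forwards the
// caller's flags here, so something like
//   dlopen("libplugin.so", RTLD_NOW | RTLD_DEEPBIND)   // hypothetical library
// is rejected before the real dlopen runs, while the same call without
// RTLD_DEEPBIND is allowed through. CheckPluginFlagsSketch is hypothetical.
static inline void CheckPluginFlagsSketch(const char *filename, int flag) {
  CheckNoDeepBind(filename, flag);  // Dies if flag contains RTLD_DEEPBIND.
}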
uptr FindAvailableMemoryRange(uptr size, uptr alignment, uptr left_padding,
uptr *largest_gap_found,
uptr *max_occupied_addr) {
UNREACHABLE("FindAvailableMemoryRange is not available");
return 0;
}
bool GetRandom(void *buffer, uptr length, bool blocking) {
if (!buffer || !length || length > 256)
return false;
#if SANITIZER_USE_GETENTROPY
uptr rnd = getentropy(buffer, length);
int rverrno = 0;
if (internal_iserror(rnd, &rverrno) && rverrno == EFAULT)
return false;
else if (rnd == 0)
return true;
#endif // SANITIZER_USE_GETENTROPY
#if SANITIZER_USE_GETRANDOM
static atomic_uint8_t skip_getrandom_syscall;
if (!atomic_load_relaxed(&skip_getrandom_syscall)) {
// Up to 256 bytes, getrandom will not be interrupted.
uptr res = internal_syscall(SYSCALL(getrandom), buffer, length,
blocking ? 0 : GRND_NONBLOCK);
int rverrno = 0;
if (internal_iserror(res, &rverrno) && rverrno == ENOSYS)
atomic_store_relaxed(&skip_getrandom_syscall, 1);
else if (res == length)
return true;
}
#endif // SANITIZER_USE_GETRANDOM
// Up to 256 bytes, a read off /dev/urandom will not be interrupted.
// blocking is moot here, O_NONBLOCK has no effect when opening /dev/urandom.
uptr fd = internal_open("/dev/urandom", O_RDONLY);
if (internal_iserror(fd))
return false;
uptr res = internal_read(fd, buffer, length);
internal_close(fd);
if (internal_iserror(res))
return false;
return true;
}
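// Usage sketch (illustrative only): GetRandom() fills at most 256 bytes,
// preferring getentropy/getrandom and falling back to /dev/urandom; callers
// must handle a false return. SeedSketch is a hypothetical helper.
static inline u64 SeedSketch() {
  u64 seed = 0;
  if (!GetRandom(&seed, sizeof(seed), /*blocking=*/false))
    seed = 0x9e3779b97f4a7c15ULL;  // Arbitrary fallback for this sketch only.
  return seed;
}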
} // namespace __sanitizer
#endif
Index: projects/clang900-import/contrib/compiler-rt
===================================================================
--- projects/clang900-import/contrib/compiler-rt (revision 351721)
+++ projects/clang900-import/contrib/compiler-rt (revision 351722)
Property changes on: projects/clang900-import/contrib/compiler-rt
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/compiler-rt/dist-release_90:r351684-351721
Index: projects/clang900-import/contrib/libc++/include/__locale
===================================================================
--- projects/clang900-import/contrib/libc++/include/__locale (revision 351721)
+++ projects/clang900-import/contrib/libc++/include/__locale (revision 351722)
@@ -1,1553 +1,1553 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP___LOCALE
#define _LIBCPP___LOCALE
#include <__config>
#include <string>
#include <memory>
#include <utility>
#include <mutex>
#include <cstdint>
#include <cctype>
#include <locale.h>
#if defined(_LIBCPP_MSVCRT_LIKE)
# include <cstring>
# include <support/win32/locale_win32.h>
#elif defined(_AIX)
# include <support/ibm/xlocale.h>
#elif defined(__ANDROID__)
# include <support/android/locale_bionic.h>
#elif defined(__sun__)
# include <xlocale.h>
# include <support/solaris/xlocale.h>
#elif defined(_NEWLIB_VERSION)
# include <support/newlib/xlocale.h>
#elif (defined(__APPLE__) || defined(__FreeBSD__) \
|| defined(__EMSCRIPTEN__) || defined(__IBMCPP__))
# include <xlocale.h>
#elif defined(__Fuchsia__)
# include <support/fuchsia/xlocale.h>
#elif defined(__wasi__)
// WASI libc uses musl's locales support.
# include <support/musl/xlocale.h>
#elif defined(_LIBCPP_HAS_MUSL_LIBC)
# include <support/musl/xlocale.h>
#endif
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
#if !defined(_LIBCPP_LOCALE__L_EXTENSIONS)
struct __libcpp_locale_guard {
_LIBCPP_INLINE_VISIBILITY
__libcpp_locale_guard(locale_t& __loc) : __old_loc_(uselocale(__loc)) {}
_LIBCPP_INLINE_VISIBILITY
~__libcpp_locale_guard() {
if (__old_loc_)
uselocale(__old_loc_);
}
locale_t __old_loc_;
private:
__libcpp_locale_guard(__libcpp_locale_guard const&);
__libcpp_locale_guard& operator=(__libcpp_locale_guard const&);
};
#elif defined(_LIBCPP_MSVCRT_LIKE)
struct __libcpp_locale_guard {
__libcpp_locale_guard(locale_t __l) :
__status(_configthreadlocale(_ENABLE_PER_THREAD_LOCALE)) {
// Setting the locale can be expensive even when the locale given is
// already the current locale, so do an explicit check to see if the
// current locale is already the one we want.
const char* __lc = __setlocale(nullptr);
// If every category is the same, the locale string will simply be the
// locale name, otherwise it will be a semicolon-separated string listing
// each category. In the second case, we know at least one category won't
// be what we want, so we only have to check the first case.
if (strcmp(__l.__get_locale(), __lc) != 0) {
__locale_all = _strdup(__lc);
if (__locale_all == nullptr)
__throw_bad_alloc();
__setlocale(__l.__get_locale());
}
}
~__libcpp_locale_guard() {
// The CRT documentation doesn't explicitly say, but setlocale() does the
// right thing when given a semicolon-separated list of locale settings
// for the different categories in the same format as returned by
// setlocale(LC_ALL, nullptr).
if (__locale_all != nullptr) {
__setlocale(__locale_all);
free(__locale_all);
}
_configthreadlocale(__status);
}
static const char* __setlocale(const char* __locale) {
const char* __new_locale = setlocale(LC_ALL, __locale);
if (__new_locale == nullptr)
__throw_bad_alloc();
return __new_locale;
}
int __status;
char* __locale_all = nullptr;
};
#endif
class _LIBCPP_TYPE_VIS locale;
template <class _Facet>
_LIBCPP_INLINE_VISIBILITY
bool
has_facet(const locale&) _NOEXCEPT;
template <class _Facet>
_LIBCPP_INLINE_VISIBILITY
const _Facet&
use_facet(const locale&);
class _LIBCPP_TYPE_VIS locale
{
public:
// types:
class _LIBCPP_TYPE_VIS facet;
class _LIBCPP_TYPE_VIS id;
typedef int category;
_LIBCPP_AVAILABILITY_LOCALE_CATEGORY
static const category // values assigned here are for exposition only
none = 0,
collate = LC_COLLATE_MASK,
ctype = LC_CTYPE_MASK,
monetary = LC_MONETARY_MASK,
numeric = LC_NUMERIC_MASK,
time = LC_TIME_MASK,
messages = LC_MESSAGES_MASK,
all = collate | ctype | monetary | numeric | time | messages;
// construct/copy/destroy:
locale() _NOEXCEPT;
locale(const locale&) _NOEXCEPT;
explicit locale(const char*);
explicit locale(const string&);
locale(const locale&, const char*, category);
locale(const locale&, const string&, category);
template <class _Facet>
_LIBCPP_INLINE_VISIBILITY locale(const locale&, _Facet*);
locale(const locale&, const locale&, category);
~locale();
const locale& operator=(const locale&) _NOEXCEPT;
template <class _Facet>
_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
locale combine(const locale&) const;
// locale operations:
string name() const;
bool operator==(const locale&) const;
bool operator!=(const locale& __y) const {return !(*this == __y);}
template <class _CharT, class _Traits, class _Allocator>
_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
bool operator()(const basic_string<_CharT, _Traits, _Allocator>&,
const basic_string<_CharT, _Traits, _Allocator>&) const;
// global locale objects:
static locale global(const locale&);
static const locale& classic();
private:
class __imp;
__imp* __locale_;
void __install_ctor(const locale&, facet*, long);
static locale& __global();
bool has_facet(id&) const;
const facet* use_facet(id&) const;
template <class _Facet> friend bool has_facet(const locale&) _NOEXCEPT;
template <class _Facet> friend const _Facet& use_facet(const locale&);
};
class _LIBCPP_TYPE_VIS locale::facet
: public __shared_count
{
protected:
_LIBCPP_INLINE_VISIBILITY
explicit facet(size_t __refs = 0)
: __shared_count(static_cast<long>(__refs)-1) {}
virtual ~facet();
// facet(const facet&) = delete; // effectively done in __shared_count
// void operator=(const facet&) = delete;
private:
virtual void __on_zero_shared() _NOEXCEPT;
};
class _LIBCPP_TYPE_VIS locale::id
{
once_flag __flag_;
int32_t __id_;
static int32_t __next_id;
public:
_LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR id() :__id_(0) {}
private:
void __init();
void operator=(const id&); // = delete;
id(const id&); // = delete;
public: // only needed for tests
long __get();
friend class locale;
friend class locale::__imp;
};
template <class _Facet>
inline _LIBCPP_INLINE_VISIBILITY
locale::locale(const locale& __other, _Facet* __f)
{
__install_ctor(__other, __f, __f ? __f->id.__get() : 0);
}
template <class _Facet>
locale
locale::combine(const locale& __other) const
{
if (!_VSTD::has_facet<_Facet>(__other))
__throw_runtime_error("locale::combine: locale missing facet");
return locale(*this, &const_cast<_Facet&>(_VSTD::use_facet<_Facet>(__other)));
}
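// Usage sketch (illustrative only): combine<_Facet>() yields a copy of *this
// in which only _Facet is taken from __other, throwing runtime_error when
// __other lacks that facet. User code (outside this header) might write:
//
//   std::locale mixed =
//       std::locale::classic().combine<std::numpunct<char> >(std::locale(""));
//
// i.e. the "C" locale everywhere except for numeric punctuation, which comes
// from the user's default locale.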
template <class _Facet>
inline _LIBCPP_INLINE_VISIBILITY
bool
has_facet(const locale& __l) _NOEXCEPT
{
return __l.has_facet(_Facet::id);
}
template <class _Facet>
inline _LIBCPP_INLINE_VISIBILITY
const _Facet&
use_facet(const locale& __l)
{
return static_cast<const _Facet&>(*__l.use_facet(_Facet::id));
}
// template <class _CharT> class collate;
template <class _CharT>
class _LIBCPP_TEMPLATE_VIS collate
: public locale::facet
{
public:
typedef _CharT char_type;
typedef basic_string<char_type> string_type;
_LIBCPP_INLINE_VISIBILITY
explicit collate(size_t __refs = 0)
: locale::facet(__refs) {}
_LIBCPP_INLINE_VISIBILITY
int compare(const char_type* __lo1, const char_type* __hi1,
const char_type* __lo2, const char_type* __hi2) const
{
return do_compare(__lo1, __hi1, __lo2, __hi2);
}
// FIXME(EricWF): The _LIBCPP_ALWAYS_INLINE is needed on Windows to work
// around a dllimport bug that expects an external instantiation.
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_ALWAYS_INLINE
string_type transform(const char_type* __lo, const char_type* __hi) const
{
return do_transform(__lo, __hi);
}
_LIBCPP_INLINE_VISIBILITY
long hash(const char_type* __lo, const char_type* __hi) const
{
return do_hash(__lo, __hi);
}
static locale::id id;
protected:
~collate();
virtual int do_compare(const char_type* __lo1, const char_type* __hi1,
const char_type* __lo2, const char_type* __hi2) const;
virtual string_type do_transform(const char_type* __lo, const char_type* __hi) const
{return string_type(__lo, __hi);}
virtual long do_hash(const char_type* __lo, const char_type* __hi) const;
};
template <class _CharT> locale::id collate<_CharT>::id;
template <class _CharT>
collate<_CharT>::~collate()
{
}
template <class _CharT>
int
collate<_CharT>::do_compare(const char_type* __lo1, const char_type* __hi1,
const char_type* __lo2, const char_type* __hi2) const
{
for (; __lo2 != __hi2; ++__lo1, ++__lo2)
{
if (__lo1 == __hi1 || *__lo1 < *__lo2)
return -1;
if (*__lo2 < *__lo1)
return 1;
}
return __lo1 != __hi1;
}
template <class _CharT>
long
collate<_CharT>::do_hash(const char_type* __lo, const char_type* __hi) const
{
size_t __h = 0;
const size_t __sr = __CHAR_BIT__ * sizeof(size_t) - 8;
const size_t __mask = size_t(0xF) << (__sr + 4);
for(const char_type* __p = __lo; __p != __hi; ++__p)
{
__h = (__h << 4) + static_cast<size_t>(*__p);
size_t __g = __h & __mask;
__h ^= __g | (__g >> __sr);
}
return static_cast<long>(__h);
}
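// Usage sketch (illustrative only): do_hash() folds each character into a
// shift-and-xor rolling hash, so user code can obtain a collation-aware hash
// of a string through the public hash() member, e.g.:
//
//   const std::collate<char>& col =
//       std::use_facet<std::collate<char> >(std::locale());
//   long h = col.hash(s.data(), s.data() + s.size());   // s is a std::string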
_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate<char>)
_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate<wchar_t>)
// template <class CharT> class collate_byname;
template <class _CharT> class _LIBCPP_TEMPLATE_VIS collate_byname;
template <>
class _LIBCPP_TYPE_VIS collate_byname<char>
: public collate<char>
{
locale_t __l;
public:
typedef char char_type;
typedef basic_string<char_type> string_type;
explicit collate_byname(const char* __n, size_t __refs = 0);
explicit collate_byname(const string& __n, size_t __refs = 0);
protected:
~collate_byname();
virtual int do_compare(const char_type* __lo1, const char_type* __hi1,
const char_type* __lo2, const char_type* __hi2) const;
virtual string_type do_transform(const char_type* __lo, const char_type* __hi) const;
};
template <>
class _LIBCPP_TYPE_VIS collate_byname<wchar_t>
: public collate<wchar_t>
{
locale_t __l;
public:
typedef wchar_t char_type;
typedef basic_string<char_type> string_type;
explicit collate_byname(const char* __n, size_t __refs = 0);
explicit collate_byname(const string& __n, size_t __refs = 0);
protected:
~collate_byname();
virtual int do_compare(const char_type* __lo1, const char_type* __hi1,
const char_type* __lo2, const char_type* __hi2) const;
virtual string_type do_transform(const char_type* __lo, const char_type* __hi) const;
};
template <class _CharT, class _Traits, class _Allocator>
bool
locale::operator()(const basic_string<_CharT, _Traits, _Allocator>& __x,
const basic_string<_CharT, _Traits, _Allocator>& __y) const
{
return _VSTD::use_facet<_VSTD::collate<_CharT> >(*this).compare(
__x.data(), __x.data() + __x.size(),
__y.data(), __y.data() + __y.size()) < 0;
}
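// Usage sketch (illustrative only): because locale defines this call
// operator, a locale object can serve directly as a comparison functor for
// locale-aware sorting of strings, e.g. with any installed locale name:
//
//   std::vector<std::string> words = /* ... */;
//   std::sort(words.begin(), words.end(), std::locale("en_US.UTF-8"));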
// template <class charT> class ctype
class _LIBCPP_TYPE_VIS ctype_base
{
public:
#if defined(__GLIBC__)
typedef unsigned short mask;
static const mask space = _ISspace;
static const mask print = _ISprint;
static const mask cntrl = _IScntrl;
static const mask upper = _ISupper;
static const mask lower = _ISlower;
static const mask alpha = _ISalpha;
static const mask digit = _ISdigit;
static const mask punct = _ISpunct;
static const mask xdigit = _ISxdigit;
static const mask blank = _ISblank;
#if defined(__mips__)
- static const mask __regex_word = static_cast<char_class_type>(_ISbit(15));
+ static const mask __regex_word = static_cast<mask>(_ISbit(15));
#else
static const mask __regex_word = 0x80;
#endif
#elif defined(_LIBCPP_MSVCRT_LIKE)
typedef unsigned short mask;
static const mask space = _SPACE;
static const mask print = _BLANK|_PUNCT|_ALPHA|_DIGIT;
static const mask cntrl = _CONTROL;
static const mask upper = _UPPER;
static const mask lower = _LOWER;
static const mask alpha = _ALPHA;
static const mask digit = _DIGIT;
static const mask punct = _PUNCT;
static const mask xdigit = _HEX;
static const mask blank = _BLANK;
static const mask __regex_word = 0x80;
# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT
#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
# ifdef __APPLE__
typedef __uint32_t mask;
# elif defined(__FreeBSD__)
typedef unsigned long mask;
# elif defined(__EMSCRIPTEN__) || defined(__NetBSD__)
typedef unsigned short mask;
# endif
static const mask space = _CTYPE_S;
static const mask print = _CTYPE_R;
static const mask cntrl = _CTYPE_C;
static const mask upper = _CTYPE_U;
static const mask lower = _CTYPE_L;
static const mask alpha = _CTYPE_A;
static const mask digit = _CTYPE_D;
static const mask punct = _CTYPE_P;
static const mask xdigit = _CTYPE_X;
# if defined(__NetBSD__)
static const mask blank = _CTYPE_BL;
// NetBSD defines classes up to 0x2000
// see sys/ctype_bits.h, _CTYPE_Q
static const mask __regex_word = 0x8000;
# else
static const mask blank = _CTYPE_B;
static const mask __regex_word = 0x80;
# endif
#elif defined(__sun__) || defined(_AIX)
typedef unsigned int mask;
static const mask space = _ISSPACE;
static const mask print = _ISPRINT;
static const mask cntrl = _ISCNTRL;
static const mask upper = _ISUPPER;
static const mask lower = _ISLOWER;
static const mask alpha = _ISALPHA;
static const mask digit = _ISDIGIT;
static const mask punct = _ISPUNCT;
static const mask xdigit = _ISXDIGIT;
static const mask blank = _ISBLANK;
static const mask __regex_word = 0x80;
#elif defined(_NEWLIB_VERSION)
// Same type as Newlib's _ctype_ array in newlib/libc/include/ctype.h.
typedef char mask;
static const mask space = _S;
static const mask print = _P | _U | _L | _N | _B;
static const mask cntrl = _C;
static const mask upper = _U;
static const mask lower = _L;
static const mask alpha = _U | _L;
static const mask digit = _N;
static const mask punct = _P;
static const mask xdigit = _X | _N;
static const mask blank = _B;
static const mask __regex_word = 0x80;
# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT
# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA
# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_XDIGIT
#else
typedef unsigned long mask;
static const mask space = 1<<0;
static const mask print = 1<<1;
static const mask cntrl = 1<<2;
static const mask upper = 1<<3;
static const mask lower = 1<<4;
static const mask alpha = 1<<5;
static const mask digit = 1<<6;
static const mask punct = 1<<7;
static const mask xdigit = 1<<8;
static const mask blank = 1<<9;
static const mask __regex_word = 1<<10;
#endif
static const mask alnum = alpha | digit;
static const mask graph = alnum | punct;
_LIBCPP_INLINE_VISIBILITY ctype_base() {}
};
template <class _CharT> class _LIBCPP_TEMPLATE_VIS ctype;
template <>
class _LIBCPP_TYPE_VIS ctype<wchar_t>
: public locale::facet,
public ctype_base
{
public:
typedef wchar_t char_type;
_LIBCPP_INLINE_VISIBILITY
explicit ctype(size_t __refs = 0)
: locale::facet(__refs) {}
_LIBCPP_INLINE_VISIBILITY
bool is(mask __m, char_type __c) const
{
return do_is(__m, __c);
}
_LIBCPP_INLINE_VISIBILITY
const char_type* is(const char_type* __low, const char_type* __high, mask* __vec) const
{
return do_is(__low, __high, __vec);
}
_LIBCPP_INLINE_VISIBILITY
const char_type* scan_is(mask __m, const char_type* __low, const char_type* __high) const
{
return do_scan_is(__m, __low, __high);
}
_LIBCPP_INLINE_VISIBILITY
const char_type* scan_not(mask __m, const char_type* __low, const char_type* __high) const
{
return do_scan_not(__m, __low, __high);
}
_LIBCPP_INLINE_VISIBILITY
char_type toupper(char_type __c) const
{
return do_toupper(__c);
}
_LIBCPP_INLINE_VISIBILITY
const char_type* toupper(char_type* __low, const char_type* __high) const
{
return do_toupper(__low, __high);
}
_LIBCPP_INLINE_VISIBILITY
char_type tolower(char_type __c) const
{
return do_tolower(__c);
}
_LIBCPP_INLINE_VISIBILITY
const char_type* tolower(char_type* __low, const char_type* __high) const
{
return do_tolower(__low, __high);
}
_LIBCPP_INLINE_VISIBILITY
char_type widen(char __c) const
{
return do_widen(__c);
}
_LIBCPP_INLINE_VISIBILITY
const char* widen(const char* __low, const char* __high, char_type* __to) const
{
return do_widen(__low, __high, __to);
}
_LIBCPP_INLINE_VISIBILITY
char narrow(char_type __c, char __dfault) const
{
return do_narrow(__c, __dfault);
}
_LIBCPP_INLINE_VISIBILITY
const char_type* narrow(const char_type* __low, const char_type* __high, char __dfault, char* __to) const
{
return do_narrow(__low, __high, __dfault, __to);
}
static locale::id id;
protected:
~ctype();
virtual bool do_is(mask __m, char_type __c) const;
virtual const char_type* do_is(const char_type* __low, const char_type* __high, mask* __vec) const;
virtual const char_type* do_scan_is(mask __m, const char_type* __low, const char_type* __high) const;
virtual const char_type* do_scan_not(mask __m, const char_type* __low, const char_type* __high) const;
virtual char_type do_toupper(char_type) const;
virtual const char_type* do_toupper(char_type* __low, const char_type* __high) const;
virtual char_type do_tolower(char_type) const;
virtual const char_type* do_tolower(char_type* __low, const char_type* __high) const;
virtual char_type do_widen(char) const;
virtual const char* do_widen(const char* __low, const char* __high, char_type* __dest) const;
virtual char do_narrow(char_type, char __dfault) const;
virtual const char_type* do_narrow(const char_type* __low, const char_type* __high, char __dfault, char* __dest) const;
};
template <>
class _LIBCPP_TYPE_VIS ctype<char>
: public locale::facet, public ctype_base
{
const mask* __tab_;
bool __del_;
public:
typedef char char_type;
explicit ctype(const mask* __tab = 0, bool __del = false, size_t __refs = 0);
_LIBCPP_INLINE_VISIBILITY
bool is(mask __m, char_type __c) const
{
return isascii(__c) ? (__tab_[static_cast<int>(__c)] & __m) !=0 : false;
}
_LIBCPP_INLINE_VISIBILITY
const char_type* is(const char_type* __low, const char_type* __high, mask* __vec) const
{
for (; __low != __high; ++__low, ++__vec)
*__vec = isascii(*__low) ? __tab_[static_cast<int>(*__low)] : 0;
return __low;
}
_LIBCPP_INLINE_VISIBILITY
const char_type* scan_is (mask __m, const char_type* __low, const char_type* __high) const
{
for (; __low != __high; ++__low)
if (isascii(*__low) && (__tab_[static_cast<int>(*__low)] & __m))
break;
return __low;
}
_LIBCPP_INLINE_VISIBILITY
const char_type* scan_not(mask __m, const char_type* __low, const char_type* __high) const
{
for (; __low != __high; ++__low)
if (!(isascii(*__low) && (__tab_[static_cast<int>(*__low)] & __m)))
break;
return __low;
}
_LIBCPP_INLINE_VISIBILITY
char_type toupper(char_type __c) const
{
return do_toupper(__c);
}
_LIBCPP_INLINE_VISIBILITY
const char_type* toupper(char_type* __low, const char_type* __high) const
{
return do_toupper(__low, __high);
}
_LIBCPP_INLINE_VISIBILITY
char_type tolower(char_type __c) const
{
return do_tolower(__c);
}
_LIBCPP_INLINE_VISIBILITY
const char_type* tolower(char_type* __low, const char_type* __high) const
{
return do_tolower(__low, __high);
}
_LIBCPP_INLINE_VISIBILITY
char_type widen(char __c) const
{
return do_widen(__c);
}
_LIBCPP_INLINE_VISIBILITY
const char* widen(const char* __low, const char* __high, char_type* __to) const
{
return do_widen(__low, __high, __to);
}
_LIBCPP_INLINE_VISIBILITY
char narrow(char_type __c, char __dfault) const
{
return do_narrow(__c, __dfault);
}
_LIBCPP_INLINE_VISIBILITY
const char* narrow(const char_type* __low, const char_type* __high, char __dfault, char* __to) const
{
return do_narrow(__low, __high, __dfault, __to);
}
static locale::id id;
#ifdef _CACHED_RUNES
static const size_t table_size = _CACHED_RUNES;
#else
static const size_t table_size = 256; // FIXME: Don't hardcode this.
#endif
_LIBCPP_INLINE_VISIBILITY const mask* table() const _NOEXCEPT {return __tab_;}
static const mask* classic_table() _NOEXCEPT;
#if defined(__GLIBC__) || defined(__EMSCRIPTEN__)
static const int* __classic_upper_table() _NOEXCEPT;
static const int* __classic_lower_table() _NOEXCEPT;
#endif
#if defined(__NetBSD__)
static const short* __classic_upper_table() _NOEXCEPT;
static const short* __classic_lower_table() _NOEXCEPT;
#endif
protected:
~ctype();
virtual char_type do_toupper(char_type __c) const;
virtual const char_type* do_toupper(char_type* __low, const char_type* __high) const;
virtual char_type do_tolower(char_type __c) const;
virtual const char_type* do_tolower(char_type* __low, const char_type* __high) const;
virtual char_type do_widen(char __c) const;
virtual const char* do_widen(const char* __low, const char* __high, char_type* __to) const;
virtual char do_narrow(char_type __c, char __dfault) const;
virtual const char* do_narrow(const char_type* __low, const char_type* __high, char __dfault, char* __to) const;
};
// template <class CharT> class ctype_byname;
template <class _CharT> class _LIBCPP_TEMPLATE_VIS ctype_byname;
template <>
class _LIBCPP_TYPE_VIS ctype_byname<char>
: public ctype<char>
{
locale_t __l;
public:
explicit ctype_byname(const char*, size_t = 0);
explicit ctype_byname(const string&, size_t = 0);
protected:
~ctype_byname();
virtual char_type do_toupper(char_type) const;
virtual const char_type* do_toupper(char_type* __low, const char_type* __high) const;
virtual char_type do_tolower(char_type) const;
virtual const char_type* do_tolower(char_type* __low, const char_type* __high) const;
};
template <>
class _LIBCPP_TYPE_VIS ctype_byname<wchar_t>
: public ctype<wchar_t>
{
locale_t __l;
public:
explicit ctype_byname(const char*, size_t = 0);
explicit ctype_byname(const string&, size_t = 0);
protected:
~ctype_byname();
virtual bool do_is(mask __m, char_type __c) const;
virtual const char_type* do_is(const char_type* __low, const char_type* __high, mask* __vec) const;
virtual const char_type* do_scan_is(mask __m, const char_type* __low, const char_type* __high) const;
virtual const char_type* do_scan_not(mask __m, const char_type* __low, const char_type* __high) const;
virtual char_type do_toupper(char_type) const;
virtual const char_type* do_toupper(char_type* __low, const char_type* __high) const;
virtual char_type do_tolower(char_type) const;
virtual const char_type* do_tolower(char_type* __low, const char_type* __high) const;
virtual char_type do_widen(char) const;
virtual const char* do_widen(const char* __low, const char* __high, char_type* __dest) const;
virtual char do_narrow(char_type, char __dfault) const;
virtual const char_type* do_narrow(const char_type* __low, const char_type* __high, char __dfault, char* __dest) const;
};
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
isspace(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::space, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
isprint(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::print, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
iscntrl(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::cntrl, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
isupper(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::upper, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
islower(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::lower, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
isalpha(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::alpha, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
isdigit(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::digit, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
ispunct(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::punct, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
isxdigit(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::xdigit, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
isalnum(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::alnum, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
bool
isgraph(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).is(ctype_base::graph, __c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
_CharT
toupper(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).toupper(__c);
}
template <class _CharT>
inline _LIBCPP_INLINE_VISIBILITY
_CharT
tolower(_CharT __c, const locale& __loc)
{
return use_facet<ctype<_CharT> >(__loc).tolower(__c);
}
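Illustrative aside (not part of the header): these helpers classify and convert characters against an explicit locale rather than the global C locale; the function names below are hypothetical.
// Sketch: locale-aware classification and case conversion.
#include <locale>
bool is_identifier_char(wchar_t c, const std::locale& loc)
{
    return std::isalnum(c, loc) || c == L'_';    // uses ctype<wchar_t>::is
}
wchar_t upper_in(wchar_t c, const std::locale& loc)
{
    return std::toupper(c, loc);                 // uses ctype<wchar_t>::toupper
}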
// codecvt_base
class _LIBCPP_TYPE_VIS codecvt_base
{
public:
_LIBCPP_INLINE_VISIBILITY codecvt_base() {}
enum result {ok, partial, error, noconv};
};
// template <class internT, class externT, class stateT> class codecvt;
template <class _InternT, class _ExternT, class _StateT> class _LIBCPP_TEMPLATE_VIS codecvt;
// template <> class codecvt<char, char, mbstate_t>
template <>
class _LIBCPP_TYPE_VIS codecvt<char, char, mbstate_t>
: public locale::facet,
public codecvt_base
{
public:
typedef char intern_type;
typedef char extern_type;
typedef mbstate_t state_type;
_LIBCPP_INLINE_VISIBILITY
explicit codecvt(size_t __refs = 0)
: locale::facet(__refs) {}
_LIBCPP_INLINE_VISIBILITY
result out(state_type& __st,
const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
{
return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
result unshift(state_type& __st,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
{
return do_unshift(__st, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
result in(state_type& __st,
const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const
{
return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
int encoding() const _NOEXCEPT
{
return do_encoding();
}
_LIBCPP_INLINE_VISIBILITY
bool always_noconv() const _NOEXCEPT
{
return do_always_noconv();
}
_LIBCPP_INLINE_VISIBILITY
int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const
{
return do_length(__st, __frm, __end, __mx);
}
_LIBCPP_INLINE_VISIBILITY
int max_length() const _NOEXCEPT
{
return do_max_length();
}
static locale::id id;
protected:
_LIBCPP_INLINE_VISIBILITY
explicit codecvt(const char*, size_t __refs = 0)
: locale::facet(__refs) {}
~codecvt();
virtual result do_out(state_type& __st,
const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
virtual result do_in(state_type& __st,
const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const;
virtual result do_unshift(state_type& __st,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
virtual int do_encoding() const _NOEXCEPT;
virtual bool do_always_noconv() const _NOEXCEPT;
virtual int do_length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const;
virtual int do_max_length() const _NOEXCEPT;
};
// template <> class codecvt<wchar_t, char, mbstate_t>
template <>
class _LIBCPP_TYPE_VIS codecvt<wchar_t, char, mbstate_t>
: public locale::facet,
public codecvt_base
{
locale_t __l;
public:
typedef wchar_t intern_type;
typedef char extern_type;
typedef mbstate_t state_type;
explicit codecvt(size_t __refs = 0);
_LIBCPP_INLINE_VISIBILITY
result out(state_type& __st,
const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
{
return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
result unshift(state_type& __st,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
{
return do_unshift(__st, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
result in(state_type& __st,
const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const
{
return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
int encoding() const _NOEXCEPT
{
return do_encoding();
}
_LIBCPP_INLINE_VISIBILITY
bool always_noconv() const _NOEXCEPT
{
return do_always_noconv();
}
_LIBCPP_INLINE_VISIBILITY
int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const
{
return do_length(__st, __frm, __end, __mx);
}
_LIBCPP_INLINE_VISIBILITY
int max_length() const _NOEXCEPT
{
return do_max_length();
}
static locale::id id;
protected:
explicit codecvt(const char*, size_t __refs = 0);
~codecvt();
virtual result do_out(state_type& __st,
const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
virtual result do_in(state_type& __st,
const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const;
virtual result do_unshift(state_type& __st,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
virtual int do_encoding() const _NOEXCEPT;
virtual bool do_always_noconv() const _NOEXCEPT;
virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const;
virtual int do_max_length() const _NOEXCEPT;
};
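Illustrative aside (not part of the header): a minimal sketch of driving this facet's out() by hand; it assumes the facet comes from a caller-supplied locale and collapses error handling to returning an empty string.
// Sketch: narrow a wide string through codecvt<wchar_t, char, mbstate_t>.
#include <cwchar>
#include <locale>
#include <string>
std::string narrow_with(const std::locale& loc, const std::wstring& ws)
{
    typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt_t;
    const cvt_t& cvt = std::use_facet<cvt_t>(loc);
    // Size the output for the worst case reported by the facet.
    std::string out(ws.size() * static_cast<size_t>(cvt.max_length()), '\0');
    std::mbstate_t st = std::mbstate_t();
    const wchar_t* from_next;
    char* to_next;
    cvt_t::result r = cvt.out(st, ws.data(), ws.data() + ws.size(), from_next,
                              &out[0], &out[0] + out.size(), to_next);
    if (r == cvt_t::error)
        return std::string();                    // give up on conversion errors
    out.resize(static_cast<size_t>(to_next - &out[0]));
    return out;
}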
// template <> class codecvt<char16_t, char, mbstate_t>
template <>
class _LIBCPP_TYPE_VIS codecvt<char16_t, char, mbstate_t>
: public locale::facet,
public codecvt_base
{
public:
typedef char16_t intern_type;
typedef char extern_type;
typedef mbstate_t state_type;
_LIBCPP_INLINE_VISIBILITY
explicit codecvt(size_t __refs = 0)
: locale::facet(__refs) {}
_LIBCPP_INLINE_VISIBILITY
result out(state_type& __st,
const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
{
return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
result unshift(state_type& __st,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
{
return do_unshift(__st, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
result in(state_type& __st,
const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const
{
return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
int encoding() const _NOEXCEPT
{
return do_encoding();
}
_LIBCPP_INLINE_VISIBILITY
bool always_noconv() const _NOEXCEPT
{
return do_always_noconv();
}
_LIBCPP_INLINE_VISIBILITY
int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const
{
return do_length(__st, __frm, __end, __mx);
}
_LIBCPP_INLINE_VISIBILITY
int max_length() const _NOEXCEPT
{
return do_max_length();
}
static locale::id id;
protected:
_LIBCPP_INLINE_VISIBILITY
explicit codecvt(const char*, size_t __refs = 0)
: locale::facet(__refs) {}
~codecvt();
virtual result do_out(state_type& __st,
const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
virtual result do_in(state_type& __st,
const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const;
virtual result do_unshift(state_type& __st,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
virtual int do_encoding() const _NOEXCEPT;
virtual bool do_always_noconv() const _NOEXCEPT;
virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const;
virtual int do_max_length() const _NOEXCEPT;
};
// template <> class codecvt<char32_t, char, mbstate_t>
template <>
class _LIBCPP_TYPE_VIS codecvt<char32_t, char, mbstate_t>
: public locale::facet,
public codecvt_base
{
public:
typedef char32_t intern_type;
typedef char extern_type;
typedef mbstate_t state_type;
_LIBCPP_INLINE_VISIBILITY
explicit codecvt(size_t __refs = 0)
: locale::facet(__refs) {}
_LIBCPP_INLINE_VISIBILITY
result out(state_type& __st,
const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
{
return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
result unshift(state_type& __st,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
{
return do_unshift(__st, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
result in(state_type& __st,
const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const
{
return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
}
_LIBCPP_INLINE_VISIBILITY
int encoding() const _NOEXCEPT
{
return do_encoding();
}
_LIBCPP_INLINE_VISIBILITY
bool always_noconv() const _NOEXCEPT
{
return do_always_noconv();
}
_LIBCPP_INLINE_VISIBILITY
int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const
{
return do_length(__st, __frm, __end, __mx);
}
_LIBCPP_INLINE_VISIBILITY
int max_length() const _NOEXCEPT
{
return do_max_length();
}
static locale::id id;
protected:
_LIBCPP_INLINE_VISIBILITY
explicit codecvt(const char*, size_t __refs = 0)
: locale::facet(__refs) {}
~codecvt();
virtual result do_out(state_type& __st,
const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
virtual result do_in(state_type& __st,
const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const;
virtual result do_unshift(state_type& __st,
extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
virtual int do_encoding() const _NOEXCEPT;
virtual bool do_always_noconv() const _NOEXCEPT;
virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const;
virtual int do_max_length() const _NOEXCEPT;
};
// template <class _InternT, class _ExternT, class _StateT> class codecvt_byname
template <class _InternT, class _ExternT, class _StateT>
class _LIBCPP_TEMPLATE_VIS codecvt_byname
: public codecvt<_InternT, _ExternT, _StateT>
{
public:
_LIBCPP_INLINE_VISIBILITY
explicit codecvt_byname(const char* __nm, size_t __refs = 0)
: codecvt<_InternT, _ExternT, _StateT>(__nm, __refs) {}
_LIBCPP_INLINE_VISIBILITY
explicit codecvt_byname(const string& __nm, size_t __refs = 0)
: codecvt<_InternT, _ExternT, _StateT>(__nm.c_str(), __refs) {}
protected:
~codecvt_byname();
};
template <class _InternT, class _ExternT, class _StateT>
codecvt_byname<_InternT, _ExternT, _StateT>::~codecvt_byname()
{
}
_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname<char, char, mbstate_t>)
_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname<wchar_t, char, mbstate_t>)
_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname<char16_t, char, mbstate_t>)
_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname<char32_t, char, mbstate_t>)
template <size_t _Np>
struct __narrow_to_utf8
{
template <class _OutputIterator, class _CharT>
_OutputIterator
operator()(_OutputIterator __s, const _CharT* __wb, const _CharT* __we) const;
};
template <>
struct __narrow_to_utf8<8>
{
template <class _OutputIterator, class _CharT>
_LIBCPP_INLINE_VISIBILITY
_OutputIterator
operator()(_OutputIterator __s, const _CharT* __wb, const _CharT* __we) const
{
for (; __wb < __we; ++__wb, ++__s)
*__s = *__wb;
return __s;
}
};
template <>
struct _LIBCPP_TEMPLATE_VIS __narrow_to_utf8<16>
: public codecvt<char16_t, char, mbstate_t>
{
_LIBCPP_INLINE_VISIBILITY
__narrow_to_utf8() : codecvt<char16_t, char, mbstate_t>(1) {}
_LIBCPP_EXPORTED_FROM_ABI ~__narrow_to_utf8();
template <class _OutputIterator, class _CharT>
_LIBCPP_INLINE_VISIBILITY
_OutputIterator
operator()(_OutputIterator __s, const _CharT* __wb, const _CharT* __we) const
{
result __r = ok;
mbstate_t __mb;
while (__wb < __we && __r != error)
{
const int __sz = 32;
char __buf[__sz];
char* __bn;
const char16_t* __wn = (const char16_t*)__wb;
__r = do_out(__mb, (const char16_t*)__wb, (const char16_t*)__we, __wn,
__buf, __buf+__sz, __bn);
if (__r == codecvt_base::error || __wn == (const char16_t*)__wb)
__throw_runtime_error("locale not supported");
for (const char* __p = __buf; __p < __bn; ++__p, ++__s)
*__s = *__p;
__wb = (const _CharT*)__wn;
}
return __s;
}
};
template <>
struct _LIBCPP_TEMPLATE_VIS __narrow_to_utf8<32>
: public codecvt<char32_t, char, mbstate_t>
{
_LIBCPP_INLINE_VISIBILITY
__narrow_to_utf8() : codecvt<char32_t, char, mbstate_t>(1) {}
_LIBCPP_EXPORTED_FROM_ABI ~__narrow_to_utf8();
template <class _OutputIterator, class _CharT>
_LIBCPP_INLINE_VISIBILITY
_OutputIterator
operator()(_OutputIterator __s, const _CharT* __wb, const _CharT* __we) const
{
result __r = ok;
mbstate_t __mb;
while (__wb < __we && __r != error)
{
const int __sz = 32;
char __buf[__sz];
char* __bn;
const char32_t* __wn = (const char32_t*)__wb;
__r = do_out(__mb, (const char32_t*)__wb, (const char32_t*)__we, __wn,
__buf, __buf+__sz, __bn);
if (__r == codecvt_base::error || __wn == (const char32_t*)__wb)
__throw_runtime_error("locale not supported");
for (const char* __p = __buf; __p < __bn; ++__p, ++__s)
*__s = *__p;
__wb = (const _CharT*)__wn;
}
return __s;
}
};
template <size_t _Np>
struct __widen_from_utf8
{
template <class _OutputIterator>
_OutputIterator
operator()(_OutputIterator __s, const char* __nb, const char* __ne) const;
};
template <>
struct __widen_from_utf8<8>
{
template <class _OutputIterator>
_LIBCPP_INLINE_VISIBILITY
_OutputIterator
operator()(_OutputIterator __s, const char* __nb, const char* __ne) const
{
for (; __nb < __ne; ++__nb, ++__s)
*__s = *__nb;
return __s;
}
};
template <>
struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<16>
: public codecvt<char16_t, char, mbstate_t>
{
_LIBCPP_INLINE_VISIBILITY
__widen_from_utf8() : codecvt<char16_t, char, mbstate_t>(1) {}
_LIBCPP_EXPORTED_FROM_ABI ~__widen_from_utf8();
template <class _OutputIterator>
_LIBCPP_INLINE_VISIBILITY
_OutputIterator
operator()(_OutputIterator __s, const char* __nb, const char* __ne) const
{
result __r = ok;
mbstate_t __mb;
while (__nb < __ne && __r != error)
{
const int __sz = 32;
char16_t __buf[__sz];
char16_t* __bn;
const char* __nn = __nb;
__r = do_in(__mb, __nb, __ne - __nb > __sz ? __nb+__sz : __ne, __nn,
__buf, __buf+__sz, __bn);
if (__r == codecvt_base::error || __nn == __nb)
__throw_runtime_error("locale not supported");
for (const char16_t* __p = __buf; __p < __bn; ++__p, ++__s)
*__s = (wchar_t)*__p;
__nb = __nn;
}
return __s;
}
};
template <>
struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<32>
: public codecvt<char32_t, char, mbstate_t>
{
_LIBCPP_INLINE_VISIBILITY
__widen_from_utf8() : codecvt<char32_t, char, mbstate_t>(1) {}
_LIBCPP_EXPORTED_FROM_ABI ~__widen_from_utf8();
template <class _OutputIterator>
_LIBCPP_INLINE_VISIBILITY
_OutputIterator
operator()(_OutputIterator __s, const char* __nb, const char* __ne) const
{
result __r = ok;
mbstate_t __mb;
while (__nb < __ne && __r != error)
{
const int __sz = 32;
char32_t __buf[__sz];
char32_t* __bn;
const char* __nn = __nb;
__r = do_in(__mb, __nb, __ne - __nb > __sz ? __nb+__sz : __ne, __nn,
__buf, __buf+__sz, __bn);
if (__r == codecvt_base::error || __nn == __nb)
__throw_runtime_error("locale not supported");
for (const char32_t* __p = __buf; __p < __bn; ++__p, ++__s)
*__s = (wchar_t)*__p;
__nb = __nn;
}
return __s;
}
};
// template <class charT> class numpunct
template <class _CharT> class _LIBCPP_TEMPLATE_VIS numpunct;
template <>
class _LIBCPP_TYPE_VIS numpunct<char>
: public locale::facet
{
public:
typedef char char_type;
typedef basic_string<char_type> string_type;
explicit numpunct(size_t __refs = 0);
_LIBCPP_INLINE_VISIBILITY char_type decimal_point() const {return do_decimal_point();}
_LIBCPP_INLINE_VISIBILITY char_type thousands_sep() const {return do_thousands_sep();}
_LIBCPP_INLINE_VISIBILITY string grouping() const {return do_grouping();}
_LIBCPP_INLINE_VISIBILITY string_type truename() const {return do_truename();}
_LIBCPP_INLINE_VISIBILITY string_type falsename() const {return do_falsename();}
static locale::id id;
protected:
~numpunct();
virtual char_type do_decimal_point() const;
virtual char_type do_thousands_sep() const;
virtual string do_grouping() const;
virtual string_type do_truename() const;
virtual string_type do_falsename() const;
char_type __decimal_point_;
char_type __thousands_sep_;
string __grouping_;
};
template <>
class _LIBCPP_TYPE_VIS numpunct<wchar_t>
: public locale::facet
{
public:
typedef wchar_t char_type;
typedef basic_string<char_type> string_type;
explicit numpunct(size_t __refs = 0);
_LIBCPP_INLINE_VISIBILITY char_type decimal_point() const {return do_decimal_point();}
_LIBCPP_INLINE_VISIBILITY char_type thousands_sep() const {return do_thousands_sep();}
_LIBCPP_INLINE_VISIBILITY string grouping() const {return do_grouping();}
_LIBCPP_INLINE_VISIBILITY string_type truename() const {return do_truename();}
_LIBCPP_INLINE_VISIBILITY string_type falsename() const {return do_falsename();}
static locale::id id;
protected:
~numpunct();
virtual char_type do_decimal_point() const;
virtual char_type do_thousands_sep() const;
virtual string do_grouping() const;
virtual string_type do_truename() const;
virtual string_type do_falsename() const;
char_type __decimal_point_;
char_type __thousands_sep_;
string __grouping_;
};
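Illustrative aside (not part of the header): querying the numeric punctuation of a locale through this facet.
// Sketch: read the decimal point and grouping rules for a locale.
#include <locale>
#include <string>
char decimal_point_of(const std::locale& loc)
{
    return std::use_facet<std::numpunct<char> >(loc).decimal_point();
}
std::string grouping_of(const std::locale& loc)
{
    return std::use_facet<std::numpunct<char> >(loc).grouping();
}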
// template <class charT> class numpunct_byname
template <class _CharT> class _LIBCPP_TEMPLATE_VIS numpunct_byname;
template <>
class _LIBCPP_TYPE_VIS numpunct_byname<char>
: public numpunct<char>
{
public:
typedef char char_type;
typedef basic_string<char_type> string_type;
explicit numpunct_byname(const char* __nm, size_t __refs = 0);
explicit numpunct_byname(const string& __nm, size_t __refs = 0);
protected:
~numpunct_byname();
private:
void __init(const char*);
};
template <>
class _LIBCPP_TYPE_VIS numpunct_byname<wchar_t>
: public numpunct<wchar_t>
{
public:
typedef wchar_t char_type;
typedef basic_string<char_type> string_type;
explicit numpunct_byname(const char* __nm, size_t __refs = 0);
explicit numpunct_byname(const string& __nm, size_t __refs = 0);
protected:
~numpunct_byname();
private:
void __init(const char*);
};
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___LOCALE
Index: projects/clang900-import/contrib/libc++/include/__threading_support
===================================================================
--- projects/clang900-import/contrib/libc++/include/__threading_support (revision 351721)
+++ projects/clang900-import/contrib/libc++/include/__threading_support (revision 351722)
@@ -1,484 +1,490 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP_THREADING_SUPPORT
#define _LIBCPP_THREADING_SUPPORT
#include <__config>
#include <chrono>
#include <iosfwd>
#include <errno.h>
#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER
#pragma GCC system_header
#endif
#if defined(_LIBCPP_HAS_THREAD_API_EXTERNAL)
# include <__external_threading>
#elif !defined(_LIBCPP_HAS_NO_THREADS)
-typedef ::timespec __libcpp_timespec_t;
-
#if defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
# include <pthread.h>
# include <sched.h>
#endif
-_LIBCPP_PUSH_MACROS
-#include <__undef_macros>
-
#if defined(_LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL) || \
defined(_LIBCPP_BUILDING_THREAD_LIBRARY_EXTERNAL) || \
defined(_LIBCPP_HAS_THREAD_API_WIN32)
#define _LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_FUNC_VIS
#else
#define _LIBCPP_THREAD_ABI_VISIBILITY inline _LIBCPP_INLINE_VISIBILITY
#endif
#if defined(__FreeBSD__) && defined(__clang__) && __has_attribute(no_thread_safety_analysis)
#define _LIBCPP_NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis))
#else
#define _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
#endif
+typedef ::timespec __libcpp_timespec_t;
+#endif // !defined(_LIBCPP_HAS_NO_THREADS)
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
_LIBCPP_BEGIN_NAMESPACE_STD
+#if !defined(_LIBCPP_HAS_NO_THREADS)
+
#if defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
// Mutex
typedef pthread_mutex_t __libcpp_mutex_t;
#define _LIBCPP_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
typedef pthread_mutex_t __libcpp_recursive_mutex_t;
// Condition Variable
typedef pthread_cond_t __libcpp_condvar_t;
#define _LIBCPP_CONDVAR_INITIALIZER PTHREAD_COND_INITIALIZER
// Execute once
typedef pthread_once_t __libcpp_exec_once_flag;
#define _LIBCPP_EXEC_ONCE_INITIALIZER PTHREAD_ONCE_INIT
// Thread id
typedef pthread_t __libcpp_thread_id;
// Thread
#define _LIBCPP_NULL_THREAD 0U
typedef pthread_t __libcpp_thread_t;
// Thread Local Storage
typedef pthread_key_t __libcpp_tls_key;
#define _LIBCPP_TLS_DESTRUCTOR_CC
-#else
+#elif !defined(_LIBCPP_HAS_THREAD_API_EXTERNAL)
// Mutex
typedef void* __libcpp_mutex_t;
#define _LIBCPP_MUTEX_INITIALIZER 0
#if defined(_M_IX86) || defined(__i386__) || defined(_M_ARM) || defined(__arm__)
typedef void* __libcpp_recursive_mutex_t[6];
#elif defined(_M_AMD64) || defined(__x86_64__) || defined(_M_ARM64) || defined(__aarch64__)
typedef void* __libcpp_recursive_mutex_t[5];
#else
# error Unsupported architecture
#endif
// Condition Variable
typedef void* __libcpp_condvar_t;
#define _LIBCPP_CONDVAR_INITIALIZER 0
// Execute Once
typedef void* __libcpp_exec_once_flag;
#define _LIBCPP_EXEC_ONCE_INITIALIZER 0
// Thread ID
typedef long __libcpp_thread_id;
// Thread
#define _LIBCPP_NULL_THREAD 0U
typedef void* __libcpp_thread_t;
// Thread Local Storage
typedef long __libcpp_tls_key;
#define _LIBCPP_TLS_DESTRUCTOR_CC __stdcall
-#endif
+#endif // !defined(_LIBCPP_HAS_THREAD_API_PTHREAD) && !defined(_LIBCPP_HAS_THREAD_API_EXTERNAL)
+#if !defined(_LIBCPP_HAS_THREAD_API_EXTERNAL)
// Mutex
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_recursive_mutex_init(__libcpp_recursive_mutex_t *__m);
_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
int __libcpp_recursive_mutex_lock(__libcpp_recursive_mutex_t *__m);
_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
bool __libcpp_recursive_mutex_trylock(__libcpp_recursive_mutex_t *__m);
_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
int __libcpp_recursive_mutex_unlock(__libcpp_recursive_mutex_t *__m);
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_recursive_mutex_destroy(__libcpp_recursive_mutex_t *__m);
_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
int __libcpp_mutex_lock(__libcpp_mutex_t *__m);
_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
bool __libcpp_mutex_trylock(__libcpp_mutex_t *__m);
_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
int __libcpp_mutex_unlock(__libcpp_mutex_t *__m);
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_mutex_destroy(__libcpp_mutex_t *__m);
// Condition variable
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_condvar_signal(__libcpp_condvar_t* __cv);
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_condvar_broadcast(__libcpp_condvar_t* __cv);
_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
int __libcpp_condvar_wait(__libcpp_condvar_t* __cv, __libcpp_mutex_t* __m);
_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
int __libcpp_condvar_timedwait(__libcpp_condvar_t *__cv, __libcpp_mutex_t *__m,
__libcpp_timespec_t *__ts);
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_condvar_destroy(__libcpp_condvar_t* __cv);
// Execute once
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_execute_once(__libcpp_exec_once_flag *flag,
void (*init_routine)());
// Thread id
_LIBCPP_THREAD_ABI_VISIBILITY
bool __libcpp_thread_id_equal(__libcpp_thread_id t1, __libcpp_thread_id t2);
_LIBCPP_THREAD_ABI_VISIBILITY
bool __libcpp_thread_id_less(__libcpp_thread_id t1, __libcpp_thread_id t2);
// Thread
_LIBCPP_THREAD_ABI_VISIBILITY
bool __libcpp_thread_isnull(const __libcpp_thread_t *__t);
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_thread_create(__libcpp_thread_t *__t, void *(*__func)(void *),
void *__arg);
_LIBCPP_THREAD_ABI_VISIBILITY
__libcpp_thread_id __libcpp_thread_get_current_id();
_LIBCPP_THREAD_ABI_VISIBILITY
__libcpp_thread_id __libcpp_thread_get_id(const __libcpp_thread_t *__t);
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_thread_join(__libcpp_thread_t *__t);
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_thread_detach(__libcpp_thread_t *__t);
_LIBCPP_THREAD_ABI_VISIBILITY
void __libcpp_thread_yield();
_LIBCPP_THREAD_ABI_VISIBILITY
void __libcpp_thread_sleep_for(const chrono::nanoseconds& __ns);
// Thread local storage
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_tls_create(__libcpp_tls_key* __key,
void(_LIBCPP_TLS_DESTRUCTOR_CC* __at_exit)(void*));
_LIBCPP_THREAD_ABI_VISIBILITY
void *__libcpp_tls_get(__libcpp_tls_key __key);
_LIBCPP_THREAD_ABI_VISIBILITY
int __libcpp_tls_set(__libcpp_tls_key __key, void *__p);
+#endif // !defined(_LIBCPP_HAS_THREAD_API_EXTERNAL)
+
#if (!defined(_LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL) || \
defined(_LIBCPP_BUILDING_THREAD_LIBRARY_EXTERNAL)) && \
defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
int __libcpp_recursive_mutex_init(__libcpp_recursive_mutex_t *__m)
{
pthread_mutexattr_t attr;
int __ec = pthread_mutexattr_init(&attr);
if (__ec)
return __ec;
__ec = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
if (__ec) {
pthread_mutexattr_destroy(&attr);
return __ec;
}
__ec = pthread_mutex_init(__m, &attr);
if (__ec) {
pthread_mutexattr_destroy(&attr);
return __ec;
}
__ec = pthread_mutexattr_destroy(&attr);
if (__ec) {
pthread_mutex_destroy(__m);
return __ec;
}
return 0;
}
int __libcpp_recursive_mutex_lock(__libcpp_recursive_mutex_t *__m)
{
return pthread_mutex_lock(__m);
}
bool __libcpp_recursive_mutex_trylock(__libcpp_recursive_mutex_t *__m)
{
return pthread_mutex_trylock(__m) == 0;
}
int __libcpp_recursive_mutex_unlock(__libcpp_mutex_t *__m)
{
return pthread_mutex_unlock(__m);
}
int __libcpp_recursive_mutex_destroy(__libcpp_recursive_mutex_t *__m)
{
return pthread_mutex_destroy(__m);
}
int __libcpp_mutex_lock(__libcpp_mutex_t *__m)
{
return pthread_mutex_lock(__m);
}
bool __libcpp_mutex_trylock(__libcpp_mutex_t *__m)
{
return pthread_mutex_trylock(__m) == 0;
}
int __libcpp_mutex_unlock(__libcpp_mutex_t *__m)
{
return pthread_mutex_unlock(__m);
}
int __libcpp_mutex_destroy(__libcpp_mutex_t *__m)
{
return pthread_mutex_destroy(__m);
}
// Condition Variable
int __libcpp_condvar_signal(__libcpp_condvar_t *__cv)
{
return pthread_cond_signal(__cv);
}
int __libcpp_condvar_broadcast(__libcpp_condvar_t *__cv)
{
return pthread_cond_broadcast(__cv);
}
int __libcpp_condvar_wait(__libcpp_condvar_t *__cv, __libcpp_mutex_t *__m)
{
return pthread_cond_wait(__cv, __m);
}
int __libcpp_condvar_timedwait(__libcpp_condvar_t *__cv, __libcpp_mutex_t *__m,
__libcpp_timespec_t *__ts)
{
return pthread_cond_timedwait(__cv, __m, __ts);
}
int __libcpp_condvar_destroy(__libcpp_condvar_t *__cv)
{
return pthread_cond_destroy(__cv);
}
// Execute once
int __libcpp_execute_once(__libcpp_exec_once_flag *flag,
void (*init_routine)()) {
return pthread_once(flag, init_routine);
}
// Thread id
// Returns true if the thread ids are equal, otherwise false

bool __libcpp_thread_id_equal(__libcpp_thread_id t1, __libcpp_thread_id t2)
{
return pthread_equal(t1, t2) != 0;
}
// Returns true if t1 < t2, otherwise false
bool __libcpp_thread_id_less(__libcpp_thread_id t1, __libcpp_thread_id t2)
{
return t1 < t2;
}
// Thread
bool __libcpp_thread_isnull(const __libcpp_thread_t *__t) {
return *__t == 0;
}
int __libcpp_thread_create(__libcpp_thread_t *__t, void *(*__func)(void *),
void *__arg)
{
return pthread_create(__t, 0, __func, __arg);
}
__libcpp_thread_id __libcpp_thread_get_current_id()
{
return pthread_self();
}
__libcpp_thread_id __libcpp_thread_get_id(const __libcpp_thread_t *__t)
{
return *__t;
}
int __libcpp_thread_join(__libcpp_thread_t *__t)
{
return pthread_join(*__t, 0);
}
int __libcpp_thread_detach(__libcpp_thread_t *__t)
{
return pthread_detach(*__t);
}
void __libcpp_thread_yield()
{
sched_yield();
}
void __libcpp_thread_sleep_for(const chrono::nanoseconds& __ns)
{
using namespace chrono;
seconds __s = duration_cast<seconds>(__ns);
__libcpp_timespec_t __ts;
typedef decltype(__ts.tv_sec) ts_sec;
_LIBCPP_CONSTEXPR ts_sec __ts_sec_max = numeric_limits<ts_sec>::max();
if (__s.count() < __ts_sec_max)
{
__ts.tv_sec = static_cast<ts_sec>(__s.count());
__ts.tv_nsec = static_cast<decltype(__ts.tv_nsec)>((__ns - __s).count());
}
else
{
__ts.tv_sec = __ts_sec_max;
__ts.tv_nsec = 999999999; // (10^9 - 1)
}
while (nanosleep(&__ts, &__ts) == -1 && errno == EINTR);
}
// Thread local storage
int __libcpp_tls_create(__libcpp_tls_key *__key, void (*__at_exit)(void *))
{
return pthread_key_create(__key, __at_exit);
}
void *__libcpp_tls_get(__libcpp_tls_key __key)
{
return pthread_getspecific(__key);
}
int __libcpp_tls_set(__libcpp_tls_key __key, void *__p)
{
return pthread_setspecific(__key, __p);
}
#endif // !_LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL || _LIBCPP_BUILDING_THREAD_LIBRARY_EXTERNAL
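Illustrative aside (not part of the header): the __libcpp_* shims above are libc++ internals rather than a public API; the sketch below only shows how the lock/unlock pair fits together, with hypothetical names, and real application code should use std::mutex instead.
// Sketch only: pairing the internal mutex shims (error codes ignored).
static std::__libcpp_mutex_t sketch_mutex = _LIBCPP_MUTEX_INITIALIZER;
static void run_locked(void (*fn)())
{
    std::__libcpp_mutex_lock(&sketch_mutex);     // returns 0 on success
    fn();
    std::__libcpp_mutex_unlock(&sketch_mutex);
}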
class _LIBCPP_TYPE_VIS thread;
class _LIBCPP_TYPE_VIS __thread_id;
namespace this_thread
{
_LIBCPP_INLINE_VISIBILITY __thread_id get_id() _NOEXCEPT;
} // this_thread
template<> struct hash<__thread_id>;
class _LIBCPP_TEMPLATE_VIS __thread_id
{
// FIXME: pthread_t is a pointer on Darwin but a long on Linux.
// NULL is the no-thread value on Darwin. Someone needs to check
// on other platforms. We assume 0 works everywhere for now.
__libcpp_thread_id __id_;
public:
_LIBCPP_INLINE_VISIBILITY
__thread_id() _NOEXCEPT : __id_(0) {}
friend _LIBCPP_INLINE_VISIBILITY
bool operator==(__thread_id __x, __thread_id __y) _NOEXCEPT
{ // don't pass id==0 to underlying routines
if (__x.__id_ == 0) return __y.__id_ == 0;
if (__y.__id_ == 0) return false;
return __libcpp_thread_id_equal(__x.__id_, __y.__id_);
}
friend _LIBCPP_INLINE_VISIBILITY
bool operator!=(__thread_id __x, __thread_id __y) _NOEXCEPT
{return !(__x == __y);}
friend _LIBCPP_INLINE_VISIBILITY
bool operator< (__thread_id __x, __thread_id __y) _NOEXCEPT
{ // id==0 is always less than any other thread_id
if (__x.__id_ == 0) return __y.__id_ != 0;
if (__y.__id_ == 0) return false;
return __libcpp_thread_id_less(__x.__id_, __y.__id_);
}
friend _LIBCPP_INLINE_VISIBILITY
bool operator<=(__thread_id __x, __thread_id __y) _NOEXCEPT
{return !(__y < __x);}
friend _LIBCPP_INLINE_VISIBILITY
bool operator> (__thread_id __x, __thread_id __y) _NOEXCEPT
{return __y < __x ;}
friend _LIBCPP_INLINE_VISIBILITY
bool operator>=(__thread_id __x, __thread_id __y) _NOEXCEPT
{return !(__x < __y);}
_LIBCPP_INLINE_VISIBILITY
void __reset() { __id_ = 0; }
template<class _CharT, class _Traits>
friend
_LIBCPP_INLINE_VISIBILITY
basic_ostream<_CharT, _Traits>&
operator<<(basic_ostream<_CharT, _Traits>& __os, __thread_id __id);
private:
_LIBCPP_INLINE_VISIBILITY
__thread_id(__libcpp_thread_id __id) : __id_(__id) {}
friend __thread_id this_thread::get_id() _NOEXCEPT;
friend class _LIBCPP_TYPE_VIS thread;
friend struct _LIBCPP_TEMPLATE_VIS hash<__thread_id>;
};
namespace this_thread
{
inline _LIBCPP_INLINE_VISIBILITY
__thread_id
get_id() _NOEXCEPT
{
return __libcpp_thread_get_current_id();
}
} // this_thread
+#endif // !_LIBCPP_HAS_NO_THREADS
+
_LIBCPP_END_NAMESPACE_STD
_LIBCPP_POP_MACROS
-
-#endif // !_LIBCPP_HAS_NO_THREADS
#endif // _LIBCPP_THREADING_SUPPORT
Index: projects/clang900-import/contrib/libc++
===================================================================
--- projects/clang900-import/contrib/libc++ (revision 351721)
+++ projects/clang900-import/contrib/libc++ (revision 351722)
Property changes on: projects/clang900-import/contrib/libc++
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/libc++/dist-release_90:r351684-351721
Index: projects/clang900-import/contrib/libunwind
===================================================================
--- projects/clang900-import/contrib/libunwind (revision 351721)
+++ projects/clang900-import/contrib/libunwind (revision 351722)
Property changes on: projects/clang900-import/contrib/libunwind
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm-libunwind/dist-release_90:r351684-351721
Index: projects/clang900-import/contrib/llvm/include/llvm/Analysis/InstructionSimplify.h
===================================================================
--- projects/clang900-import/contrib/llvm/include/llvm/Analysis/InstructionSimplify.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/include/llvm/Analysis/InstructionSimplify.h (revision 351722)
@@ -1,296 +1,299 @@
//===-- InstructionSimplify.h - Fold instrs into simpler forms --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares routines for folding instructions into simpler forms
// that do not require creating new instructions. This does constant folding
// ("add i32 1, 1" -> "2") but can also handle non-constant operands, either
// returning a constant ("and i32 %x, 0" -> "0") or an already existing value
// ("and i32 %x, %x" -> "%x"). If the simplification is also an instruction
// then it dominates the original instruction.
//
// These routines implicitly resolve undef uses. The easiest way to be safe when
// using these routines to obtain simplified values for existing instructions is
// to always replace all uses of the instructions with the resulting simplified
// values. This will prevent other code from seeing the same undef uses and
// resolving them to different values.
//
// These routines are designed to tolerate moderately incomplete IR, such as
// instructions that are not connected to basic blocks yet. However, they do
// require that all the IR that they encounter be valid. In particular, they
// require that all non-constant values be defined in the same function, and the
// same call context of that function (and not split between caller and callee
// contexts of a directly recursive call, for example).
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H
#define LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H
+#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/User.h"
namespace llvm {
class Function;
template <typename T, typename... TArgs> class AnalysisManager;
template <class T> class ArrayRef;
class AssumptionCache;
class CallBase;
class DominatorTree;
class DataLayout;
class FastMathFlags;
struct LoopStandardAnalysisResults;
class OptimizationRemarkEmitter;
class Pass;
class TargetLibraryInfo;
class Type;
class Value;
class MDNode;
class BinaryOperator;
/// InstrInfoQuery provides an interface to query additional information about
/// instructions, such as metadata or keywords like nsw. It falls back to
/// conservative answers unless the user has indicated that this information is
/// safe to use.
struct InstrInfoQuery {
InstrInfoQuery(bool UMD) : UseInstrInfo(UMD) {}
InstrInfoQuery() : UseInstrInfo(true) {}
bool UseInstrInfo = true;
MDNode *getMetadata(const Instruction *I, unsigned KindID) const {
if (UseInstrInfo)
return I->getMetadata(KindID);
return nullptr;
}
template <class InstT> bool hasNoUnsignedWrap(const InstT *Op) const {
if (UseInstrInfo)
return Op->hasNoUnsignedWrap();
return false;
}
template <class InstT> bool hasNoSignedWrap(const InstT *Op) const {
if (UseInstrInfo)
return Op->hasNoSignedWrap();
return false;
}
bool isExact(const BinaryOperator *Op) const {
if (UseInstrInfo && isa<PossiblyExactOperator>(Op))
return cast<PossiblyExactOperator>(Op)->isExact();
return false;
}
};
struct SimplifyQuery {
const DataLayout &DL;
const TargetLibraryInfo *TLI = nullptr;
const DominatorTree *DT = nullptr;
AssumptionCache *AC = nullptr;
const Instruction *CxtI = nullptr;
// Wrapper to query additional information for instructions like metadata or
// keywords like nsw, which provides conservative results if those cannot
// be safely used.
const InstrInfoQuery IIQ;
SimplifyQuery(const DataLayout &DL, const Instruction *CXTI = nullptr)
: DL(DL), CxtI(CXTI) {}
SimplifyQuery(const DataLayout &DL, const TargetLibraryInfo *TLI,
const DominatorTree *DT = nullptr,
AssumptionCache *AC = nullptr,
const Instruction *CXTI = nullptr, bool UseInstrInfo = true)
: DL(DL), TLI(TLI), DT(DT), AC(AC), CxtI(CXTI), IIQ(UseInstrInfo) {}
SimplifyQuery getWithInstruction(Instruction *I) const {
SimplifyQuery Copy(*this);
Copy.CxtI = I;
return Copy;
}
};
// NOTE: the explicit multiple argument versions of these functions are
// deprecated.
// Please use the SimplifyQuery versions in new code.
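Illustrative aside (not part of the header), following the note above: build one SimplifyQuery and pass it to the query-based entry points; the helper name is hypothetical.
// Sketch: try to fold a binary operator via the SimplifyQuery interface.
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
static llvm::Value *trySimplify(llvm::BinaryOperator *BO,
                                const llvm::DataLayout &DL) {
  llvm::SimplifyQuery Q(DL, /*CXTI=*/BO);        // DataLayout + context instruction
  return llvm::SimplifyBinOp(BO->getOpcode(), BO->getOperand(0),
                             BO->getOperand(1), Q);  // nullptr if nothing folds
}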
/// Given operand for an FNeg, fold the result or return null.
Value *SimplifyFNegInst(Value *Op, FastMathFlags FMF,
const SimplifyQuery &Q);
/// Given operands for an Add, fold the result or return null.
Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW,
const SimplifyQuery &Q);
/// Given operands for a Sub, fold the result or return null.
Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW,
const SimplifyQuery &Q);
/// Given operands for an FAdd, fold the result or return null.
Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF,
const SimplifyQuery &Q);
/// Given operands for an FSub, fold the result or return null.
Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF,
const SimplifyQuery &Q);
/// Given operands for an FMul, fold the result or return null.
Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF,
const SimplifyQuery &Q);
/// Given operands for a Mul, fold the result or return null.
Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q);
/// Given operands for an SDiv, fold the result or return null.
Value *SimplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q);
/// Given operands for a UDiv, fold the result or return null.
Value *SimplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q);
/// Given operands for an FDiv, fold the result or return null.
Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF,
const SimplifyQuery &Q);
/// Given operands for an SRem, fold the result or return null.
Value *SimplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q);
/// Given operands for a URem, fold the result or return null.
Value *SimplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q);
/// Given operands for an FRem, fold the result or return null.
Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF,
const SimplifyQuery &Q);
/// Given operands for a Shl, fold the result or return null.
Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const SimplifyQuery &Q);
/// Given operands for a LShr, fold the result or return null.
Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
const SimplifyQuery &Q);
/// Given operands for an AShr, fold the result or return null.
Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
const SimplifyQuery &Q);
/// Given operands for an And, fold the result or return null.
Value *SimplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q);
/// Given operands for an Or, fold the result or return null.
Value *SimplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q);
/// Given operands for an Xor, fold the result or return null.
Value *SimplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q);
/// Given operands for an ICmpInst, fold the result or return null.
Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
const SimplifyQuery &Q);
/// Given operands for an FCmpInst, fold the result or return null.
Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
FastMathFlags FMF, const SimplifyQuery &Q);
/// Given operands for a SelectInst, fold the result or return null.
Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
const SimplifyQuery &Q);
/// Given operands for a GetElementPtrInst, fold the result or return null.
Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
const SimplifyQuery &Q);
/// Given operands for an InsertValueInst, fold the result or return null.
Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
const SimplifyQuery &Q);
/// Given operands for an InsertElement, fold the result or return null.
Value *SimplifyInsertElementInst(Value *Vec, Value *Elt, Value *Idx,
const SimplifyQuery &Q);
/// Given operands for an ExtractValueInst, fold the result or return null.
Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
const SimplifyQuery &Q);
/// Given operands for an ExtractElementInst, fold the result or return null.
Value *SimplifyExtractElementInst(Value *Vec, Value *Idx,
const SimplifyQuery &Q);
/// Given operands for a CastInst, fold the result or return null.
Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
const SimplifyQuery &Q);
/// Given operands for a ShuffleVectorInst, fold the result or return null.
Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
Type *RetTy, const SimplifyQuery &Q);
//=== Helper functions for higher up the class hierarchy.
/// Given operands for a CmpInst, fold the result or return null.
Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
const SimplifyQuery &Q);
/// Given operand for a UnaryOperator, fold the result or return null.
Value *SimplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q);
/// Given operand for an FP UnaryOperator, fold the result or return null.
/// In contrast to SimplifyUnOp, try to use FastMathFlags when folding the
/// result. If FastMathFlags are not needed, this simply falls back to SimplifyUnOp.
Value *SimplifyFPUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF,
const SimplifyQuery &Q);
/// Given operands for a BinaryOperator, fold the result or return null.
Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
const SimplifyQuery &Q);
/// Given operands for an FP BinaryOperator, fold the result or return null.
/// In contrast to SimplifyBinOp, try to use FastMathFlags when folding the
/// result. If FastMathFlags are not needed, this simply falls back to SimplifyBinOp.
Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
FastMathFlags FMF, const SimplifyQuery &Q);
/// Given a callsite, fold the result or return null.
Value *SimplifyCall(CallBase *Call, const SimplifyQuery &Q);
/// See if we can compute a simplified version of this instruction. If not,
/// return null.
Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q,
OptimizationRemarkEmitter *ORE = nullptr);
/// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively.
///
/// This first performs a normal RAUW of I with SimpleV. It then recursively
/// attempts to simplify those users updated by the operation. The 'I'
/// instruction must not be equal to the simplified value 'SimpleV'.
+/// If UnsimplifiedUsers is provided, instructions that could not be simplified
+/// are added to it.
///
/// The function returns true if any simplifications were performed.
-bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
- const TargetLibraryInfo *TLI = nullptr,
- const DominatorTree *DT = nullptr,
- AssumptionCache *AC = nullptr);
+bool replaceAndRecursivelySimplify(
+ Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI = nullptr,
+ const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr,
+ SmallSetVector<Instruction *, 8> *UnsimplifiedUsers = nullptr);
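Illustrative aside (not part of the header) of the UnsimplifiedUsers out-parameter added here; I and SimpleV are assumed to be supplied by the caller, and the helper name is hypothetical.
// Sketch: replace I with SimpleV and collect users that resisted folding.
static void replaceAndTrack(llvm::Instruction *I, llvm::Value *SimpleV) {
  llvm::SmallSetVector<llvm::Instruction *, 8> UnsimplifiedUsers;
  llvm::replaceAndRecursivelySimplify(I, SimpleV, /*TLI=*/nullptr,
                                      /*DT=*/nullptr, /*AC=*/nullptr,
                                      &UnsimplifiedUsers);
  for (llvm::Instruction *U : UnsimplifiedUsers)
    (void)U;                          // e.g. hand these to a later cleanup pass
}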
/// Recursively attempt to simplify an instruction.
///
/// This routine uses SimplifyInstruction to simplify 'I', and if successful
/// replaces uses of 'I' with the simplified value. It then recurses on each
/// of the users impacted. It returns true if any simplifications were
/// performed.
bool recursivelySimplifyInstruction(Instruction *I,
const TargetLibraryInfo *TLI = nullptr,
const DominatorTree *DT = nullptr,
AssumptionCache *AC = nullptr);
// These helper functions return a SimplifyQuery structure that contains as
// many of the optional analysis we use as are currently valid. This is the
// strongly preferred way of constructing SimplifyQuery in passes.
const SimplifyQuery getBestSimplifyQuery(Pass &, Function &);
template <class T, class... TArgs>
const SimplifyQuery getBestSimplifyQuery(AnalysisManager<T, TArgs...> &,
Function &);
const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &,
const DataLayout &);
} // end namespace llvm
#endif
Index: projects/clang900-import/contrib/llvm/include/llvm/IR/InlineAsm.h
===================================================================
--- projects/clang900-import/contrib/llvm/include/llvm/IR/InlineAsm.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/include/llvm/IR/InlineAsm.h (revision 351722)
@@ -1,365 +1,366 @@
//===- llvm/InlineAsm.h - Class to represent inline asm strings -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class represents the inline asm strings, which are Value*'s that are
// used as the callee operand of call instructions. InlineAsm's are uniqued
// like constants, and created via InlineAsm::get(...).
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_IR_INLINEASM_H
#define LLVM_IR_INLINEASM_H
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Value.h"
#include <cassert>
#include <string>
#include <vector>
namespace llvm {
class FunctionType;
class PointerType;
template <class ConstantClass> class ConstantUniqueMap;
class InlineAsm final : public Value {
public:
enum AsmDialect {
AD_ATT,
AD_Intel
};
private:
friend struct InlineAsmKeyType;
friend class ConstantUniqueMap<InlineAsm>;
std::string AsmString, Constraints;
FunctionType *FTy;
bool HasSideEffects;
bool IsAlignStack;
AsmDialect Dialect;
InlineAsm(FunctionType *Ty, const std::string &AsmString,
const std::string &Constraints, bool hasSideEffects,
bool isAlignStack, AsmDialect asmDialect);
/// When the ConstantUniqueMap merges two types and makes two InlineAsms
/// identical, it destroys one of them with this method.
void destroyConstant();
public:
InlineAsm(const InlineAsm &) = delete;
InlineAsm &operator=(const InlineAsm &) = delete;
/// InlineAsm::get - Return the specified uniqued inline asm string.
///
static InlineAsm *get(FunctionType *Ty, StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
bool isAlignStack = false,
AsmDialect asmDialect = AD_ATT);
bool hasSideEffects() const { return HasSideEffects; }
bool isAlignStack() const { return IsAlignStack; }
AsmDialect getDialect() const { return Dialect; }
/// getType - InlineAsm's are always pointers.
///
PointerType *getType() const {
return reinterpret_cast<PointerType*>(Value::getType());
}
/// getFunctionType - InlineAsm's are always pointers to functions.
///
FunctionType *getFunctionType() const;
const std::string &getAsmString() const { return AsmString; }
const std::string &getConstraintString() const { return Constraints; }
/// Verify - This static method can be used by the parser to check to see if
/// the specified constraint string is legal for the type. This returns true
/// if legal, false if not.
///
static bool Verify(FunctionType *Ty, StringRef Constraints);
// Constraint String Parsing
enum ConstraintPrefix {
isInput, // 'x'
isOutput, // '=x'
isClobber // '~x'
};
using ConstraintCodeVector = std::vector<std::string>;
struct SubConstraintInfo {
/// MatchingInput - If this is not -1, this is an output constraint where an
/// input constraint is required to match it (e.g. "0"). The value is the
/// constraint number that matches this one (for example, if this is
/// constraint #0 and constraint #4 has the value "0", this will be 4).
int MatchingInput = -1;
/// Code - The constraint code, either the register name (in braces) or the
/// constraint letter/number.
ConstraintCodeVector Codes;
/// Default constructor.
SubConstraintInfo() = default;
};
using SubConstraintInfoVector = std::vector<SubConstraintInfo>;
struct ConstraintInfo;
using ConstraintInfoVector = std::vector<ConstraintInfo>;
struct ConstraintInfo {
/// Type - The basic type of the constraint: input/output/clobber
///
ConstraintPrefix Type = isInput;
/// isEarlyClobber - "&": output operand writes result before inputs are all
/// read. This is only ever set for an output operand.
bool isEarlyClobber = false;
/// MatchingInput - If this is not -1, this is an output constraint where an
/// input constraint is required to match it (e.g. "0"). The value is the
/// constraint number that matches this one (for example, if this is
/// constraint #0 and constraint #4 has the value "0", this will be 4).
int MatchingInput = -1;
/// hasMatchingInput - Return true if this is an output constraint that has
/// a matching input constraint.
bool hasMatchingInput() const { return MatchingInput != -1; }
/// isCommutative - This is set to true for a constraint that is commutative
/// with the next operand.
bool isCommutative = false;
/// isIndirect - True if this operand is an indirect operand. This means
/// that the address of the source or destination is present in the call
/// instruction, instead of it being returned or passed in explicitly. This
/// is represented with a '*' in the asm string.
bool isIndirect = false;
/// Code - The constraint code, either the register name (in braces) or the
/// constraint letter/number.
ConstraintCodeVector Codes;
/// isMultipleAlternative - '|': has multiple-alternative constraints.
bool isMultipleAlternative = false;
/// multipleAlternatives - If there are multiple alternative constraints,
/// this array will contain them. Otherwise it will be empty.
SubConstraintInfoVector multipleAlternatives;
/// The currently selected alternative constraint index.
unsigned currentAlternativeIndex = 0;
/// Default constructor.
ConstraintInfo() = default;
/// Parse - Analyze the specified string (e.g. "=*&{eax}") and fill in the
/// fields in this structure. If the constraint string is not understood,
/// return true, otherwise return false.
bool Parse(StringRef Str, ConstraintInfoVector &ConstraintsSoFar);
/// selectAlternative - Point this constraint to the alternative constraint
/// indicated by the index.
void selectAlternative(unsigned index);
};
/// ParseConstraints - Split up the constraint string into the specific
/// constraints and their prefixes. If this returns an empty vector, and if
/// the constraint string itself isn't empty, there was an error parsing.
static ConstraintInfoVector ParseConstraints(StringRef ConstraintString);
/// ParseConstraints - Parse the constraints of this inlineasm object,
/// returning them the same way that ParseConstraints(str) does.
ConstraintInfoVector ParseConstraints() const {
return ParseConstraints(Constraints);
}
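// Illustrative sketch (not in the original header): a GCC-style constraint
// string such as "=r,r,~{memory}" (an arbitrary example) parses into one
// isOutput, one isInput and one isClobber entry:
//
//   InlineAsm::ConstraintInfoVector CV =
//       InlineAsm::ParseConstraints("=r,r,~{memory}");
//   // CV[0].Type == isOutput, CV[1].Type == isInput, CV[2].Type == isClobber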
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Value *V) {
return V->getValueID() == Value::InlineAsmVal;
}
// These are helper methods for dealing with flags in the INLINEASM SDNode
// in the backend.
//
// The encoding of the flag word is currently:
// Bits 2-0 - A Kind_* value indicating the kind of the operand.
// Bits 15-3 - The number of SDNode operands associated with this inline
// assembly operand.
// If bit 31 is set:
// Bit 30-16 - The operand number that this operand must match.
// When bits 2-0 are Kind_Mem, the Constraint_* value must be
// obtained from the flags for this operand number.
// Else if bits 2-0 are Kind_Mem:
// Bit 30-16 - A Constraint_* value indicating the original constraint
// code.
// Else:
// Bit 30-16 - The register class ID to use for the operand.
enum : uint32_t {
// Fixed operands on an INLINEASM SDNode.
Op_InputChain = 0,
Op_AsmString = 1,
Op_MDNode = 2,
Op_ExtraInfo = 3, // HasSideEffects, IsAlignStack, AsmDialect.
Op_FirstOperand = 4,
// Fixed operands on an INLINEASM MachineInstr.
MIOp_AsmString = 0,
MIOp_ExtraInfo = 1, // HasSideEffects, IsAlignStack, AsmDialect.
MIOp_FirstOperand = 2,
// Interpretation of the MIOp_ExtraInfo bit field.
Extra_HasSideEffects = 1,
Extra_IsAlignStack = 2,
Extra_AsmDialect = 4,
Extra_MayLoad = 8,
Extra_MayStore = 16,
Extra_IsConvergent = 32,
// Inline asm operands map to multiple SDNode / MachineInstr operands.
// The first operand is an immediate describing the asm operand, the low
// bits are the kind:
Kind_RegUse = 1, // Input register, "r".
Kind_RegDef = 2, // Output register, "=r".
Kind_RegDefEarlyClobber = 3, // Early-clobber output register, "=&r".
Kind_Clobber = 4, // Clobbered register, "~r".
Kind_Imm = 5, // Immediate.
Kind_Mem = 6, // Memory operand, "m".
// Memory constraint codes.
// These could be tablegenerated but there's little need to do that since
// there's plenty of space in the encoding to support the union of all
// constraint codes for all targets.
Constraint_Unknown = 0,
Constraint_es,
Constraint_i,
Constraint_m,
Constraint_o,
Constraint_v,
+ Constraint_A,
Constraint_Q,
Constraint_R,
Constraint_S,
Constraint_T,
Constraint_Um,
Constraint_Un,
Constraint_Uq,
Constraint_Us,
Constraint_Ut,
Constraint_Uv,
Constraint_Uy,
Constraint_X,
Constraint_Z,
Constraint_ZC,
Constraint_Zy,
Constraints_Max = Constraint_Zy,
Constraints_ShiftAmount = 16,
Flag_MatchingOperand = 0x80000000
};
static unsigned getFlagWord(unsigned Kind, unsigned NumOps) {
assert(((NumOps << 3) & ~0xffff) == 0 && "Too many inline asm operands!");
assert(Kind >= Kind_RegUse && Kind <= Kind_Mem && "Invalid Kind");
return Kind | (NumOps << 3);
}
static bool isRegDefKind(unsigned Flag){ return getKind(Flag) == Kind_RegDef;}
static bool isImmKind(unsigned Flag) { return getKind(Flag) == Kind_Imm; }
static bool isMemKind(unsigned Flag) { return getKind(Flag) == Kind_Mem; }
static bool isRegDefEarlyClobberKind(unsigned Flag) {
return getKind(Flag) == Kind_RegDefEarlyClobber;
}
static bool isClobberKind(unsigned Flag) {
return getKind(Flag) == Kind_Clobber;
}
/// getFlagWordForMatchingOp - Augment an existing flag word returned by
/// getFlagWord with information indicating that this input operand is tied
/// to a previous output operand.
static unsigned getFlagWordForMatchingOp(unsigned InputFlag,
unsigned MatchedOperandNo) {
assert(MatchedOperandNo <= 0x7fff && "Too big matched operand");
assert((InputFlag & ~0xffff) == 0 && "High bits already contain data");
return InputFlag | Flag_MatchingOperand | (MatchedOperandNo << 16);
}
/// getFlagWordForRegClass - Augment an existing flag word returned by
/// getFlagWord with the required register class for the following register
/// operands.
/// A tied use operand cannot have a register class; use the register class
/// from the def operand instead.
static unsigned getFlagWordForRegClass(unsigned InputFlag, unsigned RC) {
// Store RC + 1, reserve the value 0 to mean 'no register class'.
++RC;
assert(!isImmKind(InputFlag) && "Immediates cannot have a register class");
assert(!isMemKind(InputFlag) && "Memory operand cannot have a register class");
assert(RC <= 0x7fff && "Too large register class ID");
assert((InputFlag & ~0xffff) == 0 && "High bits already contain data");
return InputFlag | (RC << 16);
}
/// Augment an existing flag word returned by getFlagWord with the constraint
/// code for a memory constraint.
static unsigned getFlagWordForMem(unsigned InputFlag, unsigned Constraint) {
assert(isMemKind(InputFlag) && "InputFlag is not a memory constraint!");
assert(Constraint <= 0x7fff && "Too large a memory constraint ID");
assert(Constraint <= Constraints_Max && "Unknown constraint ID");
assert((InputFlag & ~0xffff) == 0 && "High bits already contain data");
return InputFlag | (Constraint << Constraints_ShiftAmount);
}
static unsigned convertMemFlagWordToMatchingFlagWord(unsigned InputFlag) {
assert(isMemKind(InputFlag));
return InputFlag & ~(0x7fff << Constraints_ShiftAmount);
}
static unsigned getKind(unsigned Flags) {
return Flags & 7;
}
static unsigned getMemoryConstraintID(unsigned Flag) {
assert(isMemKind(Flag));
return (Flag >> Constraints_ShiftAmount) & 0x7fff;
}
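// Illustrative sketch (not in the original header) tying the helpers above
// together for a memory operand:
//
//   unsigned F = getFlagWord(Kind_Mem, /*NumOps=*/1);
//   F = getFlagWordForMem(F, Constraint_m);  // constraint code into bits 30-16
//   assert(getMemoryConstraintID(F) == Constraint_m);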
/// getNumOperandRegisters - Extract the number of registers field from the
/// inline asm operand flag.
static unsigned getNumOperandRegisters(unsigned Flag) {
return (Flag & 0xffff) >> 3;
}
/// isUseOperandTiedToDef - Return true if the flag of the inline asm
/// operand indicates it is a use operand that's matched to a def operand.
static bool isUseOperandTiedToDef(unsigned Flag, unsigned &Idx) {
if ((Flag & Flag_MatchingOperand) == 0)
return false;
Idx = (Flag & ~Flag_MatchingOperand) >> 16;
return true;
}
/// hasRegClassConstraint - Returns true if the flag contains a register
/// class constraint. Sets RC to the register class ID.
static bool hasRegClassConstraint(unsigned Flag, unsigned &RC) {
if (Flag & Flag_MatchingOperand)
return false;
unsigned High = Flag >> 16;
// getFlagWordForRegClass() uses 0 to mean no register class, and otherwise
// stores RC + 1.
if (!High)
return false;
RC = High - 1;
return true;
}
};
} // end namespace llvm
#endif // LLVM_IR_INLINEASM_H
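A short, self-contained sketch (not part of the imported header; the function name and the example values NumOps=2 and RC=5 are assumptions) showing how the flag-word helpers declared above round-trip the bit layout documented in the class comments:
#include "llvm/IR/InlineAsm.h"
#include <cassert>
// Pack a register-def operand flag word and read its fields back.
static void flagWordRoundTrip() {
  using llvm::InlineAsm;
  // Kind_RegDef in bits 2-0, two SDNode operands in bits 15-3.
  unsigned Flag = InlineAsm::getFlagWord(InlineAsm::Kind_RegDef, /*NumOps=*/2);
  // Attach register class 5; the helper stores RC + 1 in bits 30-16.
  Flag = InlineAsm::getFlagWordForRegClass(Flag, /*RC=*/5);
  assert(InlineAsm::isRegDefKind(Flag));
  assert(InlineAsm::getNumOperandRegisters(Flag) == 2);
  unsigned RC = ~0u;
  assert(InlineAsm::hasRegClassConstraint(Flag, RC) && RC == 5);
  (void)Flag;
  (void)RC;
}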
Index: projects/clang900-import/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Analysis/InstructionSimplify.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Analysis/InstructionSimplify.cpp (revision 351722)
@@ -1,5326 +1,5332 @@
//===- InstructionSimplify.cpp - Fold instruction operands ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements routines for folding instructions into simpler forms
// that do not require creating new instructions. This does constant folding
// ("add i32 1, 1" -> "2") but can also handle non-constant operands, either
// returning a constant ("and i32 %x, 0" -> "0") or an already existing value
// ("and i32 %x, %x" -> "%x"). All operands are assumed to have already been
// simplified: This is usually true and assuming it simplifies the logic (if
// they have not been simplified then results are correct but maybe suboptimal).
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/KnownBits.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instsimplify"
enum { RecursionLimit = 3 };
STATISTIC(NumExpand, "Number of expansions");
STATISTIC(NumReassoc, "Number of reassociations");
static Value *SimplifyAndInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *simplifyUnOp(unsigned, Value *, const SimplifyQuery &, unsigned);
static Value *simplifyFPUnOp(unsigned, Value *, const FastMathFlags &,
const SimplifyQuery &, unsigned);
static Value *SimplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &,
unsigned);
static Value *SimplifyFPBinOp(unsigned, Value *, Value *, const FastMathFlags &,
const SimplifyQuery &, unsigned);
static Value *SimplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &,
unsigned);
static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
const SimplifyQuery &Q, unsigned MaxRecurse);
static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyCastInst(unsigned, Value *, Type *,
const SimplifyQuery &, unsigned);
static Value *SimplifyGEPInst(Type *, ArrayRef<Value *>, const SimplifyQuery &,
unsigned);
static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal,
Value *FalseVal) {
BinaryOperator::BinaryOps BinOpCode;
if (auto *BO = dyn_cast<BinaryOperator>(Cond))
BinOpCode = BO->getOpcode();
else
return nullptr;
CmpInst::Predicate ExpectedPred, Pred1, Pred2;
if (BinOpCode == BinaryOperator::Or) {
ExpectedPred = ICmpInst::ICMP_NE;
} else if (BinOpCode == BinaryOperator::And) {
ExpectedPred = ICmpInst::ICMP_EQ;
} else
return nullptr;
// %A = icmp eq %TV, %FV
// %B = icmp eq %X, %Y (and one of these is a select operand)
// %C = and %A, %B
// %D = select %C, %TV, %FV
// -->
// %FV
// %A = icmp ne %TV, %FV
// %B = icmp ne %X, %Y (and one of these is a select operand)
// %C = or %A, %B
// %D = select %C, %TV, %FV
// -->
// %TV
Value *X, *Y;
if (!match(Cond, m_c_BinOp(m_c_ICmp(Pred1, m_Specific(TrueVal),
m_Specific(FalseVal)),
m_ICmp(Pred2, m_Value(X), m_Value(Y)))) ||
Pred1 != Pred2 || Pred1 != ExpectedPred)
return nullptr;
if (X == TrueVal || X == FalseVal || Y == TrueVal || Y == FalseVal)
return BinOpCode == BinaryOperator::Or ? TrueVal : FalseVal;
return nullptr;
}
/// For a boolean type or a vector of boolean type, return false or a vector
/// with every element false.
static Constant *getFalse(Type *Ty) {
return ConstantInt::getFalse(Ty);
}
/// For a boolean type or a vector of boolean type, return true or a vector
/// with every element true.
static Constant *getTrue(Type *Ty) {
return ConstantInt::getTrue(Ty);
}
/// isSameCompare - Is V equivalent to the comparison "LHS Pred RHS"?
static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS,
Value *RHS) {
CmpInst *Cmp = dyn_cast<CmpInst>(V);
if (!Cmp)
return false;
CmpInst::Predicate CPred = Cmp->getPredicate();
Value *CLHS = Cmp->getOperand(0), *CRHS = Cmp->getOperand(1);
if (CPred == Pred && CLHS == LHS && CRHS == RHS)
return true;
return CPred == CmpInst::getSwappedPredicate(Pred) && CLHS == RHS &&
CRHS == LHS;
}
/// Does the given value dominate the specified phi node?
static bool valueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I)
// Arguments and constants dominate all instructions.
return true;
// If we are processing instructions (and/or basic blocks) that have not been
// fully added to a function, the parent nodes may still be null. Simply
// return the conservative answer in these cases.
if (!I->getParent() || !P->getParent() || !I->getFunction())
return false;
// If we have a DominatorTree then do a precise test.
if (DT)
return DT->dominates(I, P);
// Otherwise, if the instruction is in the entry block and is not an invoke,
// then it obviously dominates all phi nodes.
if (I->getParent() == &I->getFunction()->getEntryBlock() &&
!isa<InvokeInst>(I))
return true;
return false;
}
/// Simplify "A op (B op' C)" by distributing op over op', turning it into
/// "(A op B) op' (A op C)". Here "op" is given by Opcode and "op'" is
/// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS.
/// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)".
/// Returns the simplified value, or null if no simplification was performed.
static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
Instruction::BinaryOps OpcodeToExpand,
const SimplifyQuery &Q, unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
// Check whether the expression has the form "(A op' B) op C".
if (BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS))
if (Op0->getOpcode() == OpcodeToExpand) {
// It does! Try turning it into "(A op C) op' (B op C)".
Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
// Do "A op C" and "B op C" both simplify?
if (Value *L = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse))
if (Value *R = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) {
// They do! Return "L op' R" if it simplifies or is already available.
// If "L op' R" equals "A op' B" then "L op' R" is just the LHS.
if ((L == A && R == B) || (Instruction::isCommutative(OpcodeToExpand)
&& L == B && R == A)) {
++NumExpand;
return LHS;
}
// Otherwise return "L op' R" if it simplifies.
if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) {
++NumExpand;
return V;
}
}
}
// Check whether the expression has the form "A op (B op' C)".
if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS))
if (Op1->getOpcode() == OpcodeToExpand) {
// It does! Try turning it into "(A op B) op' (A op C)".
Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
// Do "A op B" and "A op C" both simplify?
if (Value *L = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse))
if (Value *R = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) {
// They do! Return "L op' R" if it simplifies or is already available.
// If "L op' R" equals "B op' C" then "L op' R" is just the RHS.
if ((L == B && R == C) || (Instruction::isCommutative(OpcodeToExpand)
&& L == C && R == B)) {
++NumExpand;
return RHS;
}
// Otherwise return "L op' R" if it simplifies.
if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) {
++NumExpand;
return V;
}
}
}
return nullptr;
}
/// Generic simplifications for associative binary operations.
/// Returns the simpler value, or null if none was found.
static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode,
Value *LHS, Value *RHS,
const SimplifyQuery &Q,
unsigned MaxRecurse) {
assert(Instruction::isAssociative(Opcode) && "Not an associative operation!");
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
// Transform: "(A op B) op C" ==> "A op (B op C)" if it simplifies completely.
if (Op0 && Op0->getOpcode() == Opcode) {
Value *A = Op0->getOperand(0);
Value *B = Op0->getOperand(1);
Value *C = RHS;
// Does "B op C" simplify?
if (Value *V = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) {
// It does! Return "A op V" if it simplifies or is already available.
// If V equals B then "A op V" is just the LHS.
if (V == B) return LHS;
// Otherwise return "A op V" if it simplifies.
if (Value *W = SimplifyBinOp(Opcode, A, V, Q, MaxRecurse)) {
++NumReassoc;
return W;
}
}
}
// Transform: "A op (B op C)" ==> "(A op B) op C" if it simplifies completely.
if (Op1 && Op1->getOpcode() == Opcode) {
Value *A = LHS;
Value *B = Op1->getOperand(0);
Value *C = Op1->getOperand(1);
// Does "A op B" simplify?
if (Value *V = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) {
// It does! Return "V op C" if it simplifies or is already available.
// If V equals B then "V op C" is just the RHS.
if (V == B) return RHS;
// Otherwise return "V op C" if it simplifies.
if (Value *W = SimplifyBinOp(Opcode, V, C, Q, MaxRecurse)) {
++NumReassoc;
return W;
}
}
}
// The remaining transforms require commutativity as well as associativity.
if (!Instruction::isCommutative(Opcode))
return nullptr;
// Transform: "(A op B) op C" ==> "(C op A) op B" if it simplifies completely.
if (Op0 && Op0->getOpcode() == Opcode) {
Value *A = Op0->getOperand(0);
Value *B = Op0->getOperand(1);
Value *C = RHS;
// Does "C op A" simplify?
if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) {
// It does! Return "V op B" if it simplifies or is already available.
// If V equals A then "V op B" is just the LHS.
if (V == A) return LHS;
// Otherwise return "V op B" if it simplifies.
if (Value *W = SimplifyBinOp(Opcode, V, B, Q, MaxRecurse)) {
++NumReassoc;
return W;
}
}
}
// Transform: "A op (B op C)" ==> "B op (C op A)" if it simplifies completely.
if (Op1 && Op1->getOpcode() == Opcode) {
Value *A = LHS;
Value *B = Op1->getOperand(0);
Value *C = Op1->getOperand(1);
// Does "C op A" simplify?
if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) {
// It does! Return "B op V" if it simplifies or is already available.
// If V equals C then "B op V" is just the RHS.
if (V == C) return RHS;
// Otherwise return "B op V" if it simplifies.
if (Value *W = SimplifyBinOp(Opcode, B, V, Q, MaxRecurse)) {
++NumReassoc;
return W;
}
}
}
return nullptr;
}
/// In the case of a binary operation with a select instruction as an operand,
/// try to simplify the binop by seeing whether evaluating it on both branches
/// of the select results in the same value. Returns the common value if so,
/// otherwise returns null.
static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS,
Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
SelectInst *SI;
if (isa<SelectInst>(LHS)) {
SI = cast<SelectInst>(LHS);
} else {
assert(isa<SelectInst>(RHS) && "No select instruction operand!");
SI = cast<SelectInst>(RHS);
}
// Evaluate the BinOp on the true and false branches of the select.
Value *TV;
Value *FV;
if (SI == LHS) {
TV = SimplifyBinOp(Opcode, SI->getTrueValue(), RHS, Q, MaxRecurse);
FV = SimplifyBinOp(Opcode, SI->getFalseValue(), RHS, Q, MaxRecurse);
} else {
TV = SimplifyBinOp(Opcode, LHS, SI->getTrueValue(), Q, MaxRecurse);
FV = SimplifyBinOp(Opcode, LHS, SI->getFalseValue(), Q, MaxRecurse);
}
// If they simplified to the same value, then return the common value.
// If they both failed to simplify then return null.
if (TV == FV)
return TV;
// If one branch simplified to undef, return the other one.
if (TV && isa<UndefValue>(TV))
return FV;
if (FV && isa<UndefValue>(FV))
return TV;
// If applying the operation did not change the true and false select values,
// then the result of the binop is the select itself.
if (TV == SI->getTrueValue() && FV == SI->getFalseValue())
return SI;
// If one branch simplified and the other did not, and the simplified
// value is equal to the unsimplified one, return the simplified value.
// For example, select (cond, X, X & Z) & Z -> X & Z.
if ((FV && !TV) || (TV && !FV)) {
// Check that the simplified value has the form "X op Y" where "op" is the
// same as the original operation.
Instruction *Simplified = dyn_cast<Instruction>(FV ? FV : TV);
if (Simplified && Simplified->getOpcode() == unsigned(Opcode)) {
// The value that didn't simplify is "UnsimplifiedLHS op UnsimplifiedRHS".
// We already know that "op" is the same as for the simplified value. See
// if the operands match too. If so, return the simplified value.
Value *UnsimplifiedBranch = FV ? SI->getTrueValue() : SI->getFalseValue();
Value *UnsimplifiedLHS = SI == LHS ? UnsimplifiedBranch : LHS;
Value *UnsimplifiedRHS = SI == LHS ? RHS : UnsimplifiedBranch;
if (Simplified->getOperand(0) == UnsimplifiedLHS &&
Simplified->getOperand(1) == UnsimplifiedRHS)
return Simplified;
if (Simplified->isCommutative() &&
Simplified->getOperand(1) == UnsimplifiedLHS &&
Simplified->getOperand(0) == UnsimplifiedRHS)
return Simplified;
}
}
return nullptr;
}
/// In the case of a comparison with a select instruction, try to simplify the
/// comparison by seeing whether both branches of the select result in the same
/// value. Returns the common value if so, otherwise returns null.
static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
// Make sure the select is on the LHS.
if (!isa<SelectInst>(LHS)) {
std::swap(LHS, RHS);
Pred = CmpInst::getSwappedPredicate(Pred);
}
assert(isa<SelectInst>(LHS) && "Not comparing with a select instruction!");
SelectInst *SI = cast<SelectInst>(LHS);
Value *Cond = SI->getCondition();
Value *TV = SI->getTrueValue();
Value *FV = SI->getFalseValue();
// Now that we have "cmp select(Cond, TV, FV), RHS", analyse it.
// Does "cmp TV, RHS" simplify?
Value *TCmp = SimplifyCmpInst(Pred, TV, RHS, Q, MaxRecurse);
if (TCmp == Cond) {
// It not only simplified, it simplified to the select condition. Replace
// it with 'true'.
TCmp = getTrue(Cond->getType());
} else if (!TCmp) {
// It didn't simplify. However if "cmp TV, RHS" is equal to the select
// condition then we can replace it with 'true'. Otherwise give up.
if (!isSameCompare(Cond, Pred, TV, RHS))
return nullptr;
TCmp = getTrue(Cond->getType());
}
// Does "cmp FV, RHS" simplify?
Value *FCmp = SimplifyCmpInst(Pred, FV, RHS, Q, MaxRecurse);
if (FCmp == Cond) {
// It not only simplified, it simplified to the select condition. Replace
// it with 'false'.
FCmp = getFalse(Cond->getType());
} else if (!FCmp) {
// It didn't simplify. However if "cmp FV, RHS" is equal to the select
// condition then we can replace it with 'false'. Otherwise give up.
if (!isSameCompare(Cond, Pred, FV, RHS))
return nullptr;
FCmp = getFalse(Cond->getType());
}
// If both sides simplified to the same value, then use it as the result of
// the original comparison.
if (TCmp == FCmp)
return TCmp;
// The remaining cases only make sense if the select condition has the same
// type as the result of the comparison, so bail out if this is not so.
if (Cond->getType()->isVectorTy() != RHS->getType()->isVectorTy())
return nullptr;
// If the false value simplified to false, then the result of the compare
// is equal to "Cond && TCmp". This also catches the case when the false
// value simplified to false and the true value to true, returning "Cond".
if (match(FCmp, m_Zero()))
if (Value *V = SimplifyAndInst(Cond, TCmp, Q, MaxRecurse))
return V;
// If the true value simplified to true, then the result of the compare
// is equal to "Cond || FCmp".
if (match(TCmp, m_One()))
if (Value *V = SimplifyOrInst(Cond, FCmp, Q, MaxRecurse))
return V;
// Finally, if the false value simplified to true and the true value to
// false, then the result of the compare is equal to "!Cond".
if (match(FCmp, m_One()) && match(TCmp, m_Zero()))
if (Value *V =
SimplifyXorInst(Cond, Constant::getAllOnesValue(Cond->getType()),
Q, MaxRecurse))
return V;
return nullptr;
}
/// In the case of a binary operation with an operand that is a PHI instruction,
/// try to simplify the binop by seeing whether evaluating it on the incoming
/// phi values yields the same result for every value. If so returns the common
/// value, otherwise returns null.
static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS,
Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
PHINode *PI;
if (isa<PHINode>(LHS)) {
PI = cast<PHINode>(LHS);
// Bail out if RHS and the phi may be mutually interdependent due to a loop.
if (!valueDominatesPHI(RHS, PI, Q.DT))
return nullptr;
} else {
assert(isa<PHINode>(RHS) && "No PHI instruction operand!");
PI = cast<PHINode>(RHS);
// Bail out if LHS and the phi may be mutually interdependent due to a loop.
if (!valueDominatesPHI(LHS, PI, Q.DT))
return nullptr;
}
// Evaluate the BinOp on the incoming phi values.
Value *CommonValue = nullptr;
for (Value *Incoming : PI->incoming_values()) {
// If the incoming value is the phi node itself, it can safely be skipped.
if (Incoming == PI) continue;
Value *V = PI == LHS ?
SimplifyBinOp(Opcode, Incoming, RHS, Q, MaxRecurse) :
SimplifyBinOp(Opcode, LHS, Incoming, Q, MaxRecurse);
// If the operation failed to simplify, or simplified to a different value
// from the one seen previously, then give up.
if (!V || (CommonValue && V != CommonValue))
return nullptr;
CommonValue = V;
}
return CommonValue;
}
/// In the case of a comparison with a PHI instruction, try to simplify the
/// comparison by seeing whether comparing with all of the incoming phi values
/// yields the same result every time. If so returns the common result,
/// otherwise returns null.
static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
const SimplifyQuery &Q, unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
// Make sure the phi is on the LHS.
if (!isa<PHINode>(LHS)) {
std::swap(LHS, RHS);
Pred = CmpInst::getSwappedPredicate(Pred);
}
assert(isa<PHINode>(LHS) && "Not comparing with a phi instruction!");
PHINode *PI = cast<PHINode>(LHS);
// Bail out if RHS and the phi may be mutually interdependent due to a loop.
if (!valueDominatesPHI(RHS, PI, Q.DT))
return nullptr;
// Evaluate the comparison on the incoming phi values.
Value *CommonValue = nullptr;
for (Value *Incoming : PI->incoming_values()) {
// If the incoming value is the phi node itself, it can safely be skipped.
if (Incoming == PI) continue;
Value *V = SimplifyCmpInst(Pred, Incoming, RHS, Q, MaxRecurse);
// If the operation failed to simplify, or simplified to a different value
// from the one seen previously, then give up.
if (!V || (CommonValue && V != CommonValue))
return nullptr;
CommonValue = V;
}
return CommonValue;
}
static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode,
Value *&Op0, Value *&Op1,
const SimplifyQuery &Q) {
if (auto *CLHS = dyn_cast<Constant>(Op0)) {
if (auto *CRHS = dyn_cast<Constant>(Op1))
return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);
// Canonicalize the constant to the RHS if this is a commutative operation.
if (Instruction::isCommutative(Opcode))
std::swap(Op0, Op1);
}
return nullptr;
}
/// Given operands for an Add, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q))
return C;
// X + undef -> undef
if (match(Op1, m_Undef()))
return Op1;
// X + 0 -> X
if (match(Op1, m_Zero()))
return Op0;
// If two operands are negative, return 0.
if (isKnownNegation(Op0, Op1))
return Constant::getNullValue(Op0->getType());
// X + (Y - X) -> Y
// (Y - X) + X -> Y
// Eg: X + -X -> 0
Value *Y = nullptr;
if (match(Op1, m_Sub(m_Value(Y), m_Specific(Op0))) ||
match(Op0, m_Sub(m_Value(Y), m_Specific(Op1))))
return Y;
// X + ~X -> -1 since ~X = -X-1
Type *Ty = Op0->getType();
if (match(Op0, m_Not(m_Specific(Op1))) ||
match(Op1, m_Not(m_Specific(Op0))))
return Constant::getAllOnesValue(Ty);
// add nsw/nuw (xor Y, signmask), signmask --> Y
// The no-wrapping add guarantees that the top bit will be set by the add.
// Therefore, the xor must be clearing the already set sign bit of Y.
if ((IsNSW || IsNUW) && match(Op1, m_SignMask()) &&
match(Op0, m_Xor(m_Value(Y), m_SignMask())))
return Y;
// add nuw %x, -1 -> -1, because %x can only be 0.
if (IsNUW && match(Op1, m_AllOnes()))
return Op1; // Which is -1.
// i1 add -> xor.
if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1))
if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
return V;
// Try some generic simplifications for associative operations.
if (Value *V = SimplifyAssociativeBinOp(Instruction::Add, Op0, Op1, Q,
MaxRecurse))
return V;
// Threading Add over selects and phi nodes is pointless, so don't bother.
// Threading over the select in "A + select(cond, B, C)" means evaluating
// "A+B" and "A+C" and seeing if they are equal; but they are equal if and
// only if B and C are equal. If B and C are equal then (since we assume
// that operands have already been simplified) "select(cond, B, C)" should
// have been simplified to the common value of B and C already. Analysing
// "A+B" and "A+C" thus gains nothing, but costs compile time. Similarly
// for threading over phi nodes.
return nullptr;
}
Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
const SimplifyQuery &Query) {
return ::SimplifyAddInst(Op0, Op1, IsNSW, IsNUW, Query, RecursionLimit);
}
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
///
/// This is very similar to GetPointerBaseWithConstantOffset except it doesn't
/// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc.
/// folding.
static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V,
bool AllowNonInbounds = false) {
assert(V->getType()->isPtrOrPtrVectorTy());
Type *IntPtrTy = DL.getIntPtrType(V->getType())->getScalarType();
APInt Offset = APInt::getNullValue(IntPtrTy->getIntegerBitWidth());
V = V->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds);
// Because the strip may trace through `addrspacecast`, the calculated offset
// may need to be sign-extended or truncated.
IntPtrTy = DL.getIntPtrType(V->getType())->getScalarType();
Offset = Offset.sextOrTrunc(IntPtrTy->getIntegerBitWidth());
Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset);
if (V->getType()->isVectorTy())
return ConstantVector::getSplat(V->getType()->getVectorNumElements(),
OffsetIntPtr);
return OffsetIntPtr;
}
/// Compute the constant difference between two pointer values.
/// If the difference is not a constant, returns zero.
static Constant *computePointerDifference(const DataLayout &DL, Value *LHS,
Value *RHS) {
Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS);
Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS);
// If LHS and RHS are not related via constant offsets to the same base
// value, there is nothing we can do here.
if (LHS != RHS)
return nullptr;
// Otherwise, the difference of LHS - RHS can be computed as:
// LHS - RHS
// = (LHSOffset + Base) - (RHSOffset + Base)
// = LHSOffset - RHSOffset
return ConstantExpr::getSub(LHSOffset, RHSOffset);
}
/// Given operands for a Sub, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q))
return C;
// X - undef -> undef
// undef - X -> undef
if (match(Op0, m_Undef()) || match(Op1, m_Undef()))
return UndefValue::get(Op0->getType());
// X - 0 -> X
if (match(Op1, m_Zero()))
return Op0;
// X - X -> 0
if (Op0 == Op1)
return Constant::getNullValue(Op0->getType());
// Is this a negation?
if (match(Op0, m_Zero())) {
// 0 - X -> 0 if the sub is NUW.
if (isNUW)
return Constant::getNullValue(Op0->getType());
KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (Known.Zero.isMaxSignedValue()) {
// Op1 is either 0 or the minimum signed value. If the sub is NSW, then
// Op1 must be 0 because negating the minimum signed value is undefined.
if (isNSW)
return Constant::getNullValue(Op0->getType());
// 0 - X -> X if X is 0 or the minimum signed value.
return Op1;
}
}
// (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies.
// For example, (X + Y) - Y -> X; (Y + X) - Y -> X
Value *X = nullptr, *Y = nullptr, *Z = Op1;
if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z
// See if "V === Y - Z" simplifies.
if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1))
// It does! Now see if "X + V" simplifies.
if (Value *W = SimplifyBinOp(Instruction::Add, X, V, Q, MaxRecurse-1)) {
// It does, we successfully reassociated!
++NumReassoc;
return W;
}
// See if "V === X - Z" simplifies.
if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1))
// It does! Now see if "Y + V" simplifies.
if (Value *W = SimplifyBinOp(Instruction::Add, Y, V, Q, MaxRecurse-1)) {
// It does, we successfully reassociated!
++NumReassoc;
return W;
}
}
// X - (Y + Z) -> (X - Y) - Z or (X - Z) - Y if everything simplifies.
// For example, X - (X + 1) -> -1
X = Op0;
if (MaxRecurse && match(Op1, m_Add(m_Value(Y), m_Value(Z)))) { // X - (Y + Z)
// See if "V === X - Y" simplifies.
if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1))
// It does! Now see if "V - Z" simplifies.
if (Value *W = SimplifyBinOp(Instruction::Sub, V, Z, Q, MaxRecurse-1)) {
// It does, we successfully reassociated!
++NumReassoc;
return W;
}
// See if "V === X - Z" simplifies.
if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1))
// It does! Now see if "V - Y" simplifies.
if (Value *W = SimplifyBinOp(Instruction::Sub, V, Y, Q, MaxRecurse-1)) {
// It does, we successfully reassociated!
++NumReassoc;
return W;
}
}
// Z - (X - Y) -> (Z - X) + Y if everything simplifies.
// For example, X - (X - Y) -> Y.
Z = Op0;
if (MaxRecurse && match(Op1, m_Sub(m_Value(X), m_Value(Y)))) // Z - (X - Y)
// See if "V === Z - X" simplifies.
if (Value *V = SimplifyBinOp(Instruction::Sub, Z, X, Q, MaxRecurse-1))
// It does! Now see if "V + Y" simplifies.
if (Value *W = SimplifyBinOp(Instruction::Add, V, Y, Q, MaxRecurse-1)) {
// It does, we successfully reassociated!
++NumReassoc;
return W;
}
// trunc(X) - trunc(Y) -> trunc(X - Y) if everything simplifies.
if (MaxRecurse && match(Op0, m_Trunc(m_Value(X))) &&
match(Op1, m_Trunc(m_Value(Y))))
if (X->getType() == Y->getType())
// See if "V === X - Y" simplifies.
if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1))
// It does! Now see if "trunc V" simplifies.
if (Value *W = SimplifyCastInst(Instruction::Trunc, V, Op0->getType(),
Q, MaxRecurse - 1))
// It does, return the simplified "trunc V".
return W;
// Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...).
if (match(Op0, m_PtrToInt(m_Value(X))) &&
match(Op1, m_PtrToInt(m_Value(Y))))
if (Constant *Result = computePointerDifference(Q.DL, X, Y))
return ConstantExpr::getIntegerCast(Result, Op0->getType(), true);
// i1 sub -> xor.
if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1))
if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
return V;
// Threading Sub over selects and phi nodes is pointless, so don't bother.
// Threading over the select in "A - select(cond, B, C)" means evaluating
// "A-B" and "A-C" and seeing if they are equal; but they are equal if and
// only if B and C are equal. If B and C are equal then (since we assume
// that operands have already been simplified) "select(cond, B, C)" should
// have been simplified to the common value of B and C already. Analysing
// "A-B" and "A-C" thus gains nothing, but costs compile time. Similarly
// for threading over phi nodes.
return nullptr;
}
Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const SimplifyQuery &Q) {
return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit);
}
/// Given operands for a Mul, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q))
return C;
// X * undef -> 0
// X * 0 -> 0
if (match(Op1, m_CombineOr(m_Undef(), m_Zero())))
return Constant::getNullValue(Op0->getType());
// X * 1 -> X
if (match(Op1, m_One()))
return Op0;
// (X / Y) * Y -> X if the division is exact.
Value *X = nullptr;
if (Q.IIQ.UseInstrInfo &&
(match(Op0,
m_Exact(m_IDiv(m_Value(X), m_Specific(Op1)))) || // (X / Y) * Y
match(Op1, m_Exact(m_IDiv(m_Value(X), m_Specific(Op0)))))) // Y * (X / Y)
return X;
// i1 mul -> and.
if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1))
if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1))
return V;
// Try some generic simplifications for associative operations.
if (Value *V = SimplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, Q,
MaxRecurse))
return V;
// Mul distributes over Add. Try some generic simplifications based on this.
if (Value *V = ExpandBinOp(Instruction::Mul, Op0, Op1, Instruction::Add,
Q, MaxRecurse))
return V;
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
if (Value *V = ThreadBinOpOverSelect(Instruction::Mul, Op0, Op1, Q,
MaxRecurse))
return V;
// If the operation is with the result of a phi instruction, check whether
// operating on all incoming values of the phi always yields the same value.
if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
if (Value *V = ThreadBinOpOverPHI(Instruction::Mul, Op0, Op1, Q,
MaxRecurse))
return V;
return nullptr;
}
Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
return ::SimplifyMulInst(Op0, Op1, Q, RecursionLimit);
}
/// Check for common or similar folds of integer division or integer remainder.
/// This applies to all 4 opcodes (sdiv/udiv/srem/urem).
static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) {
Type *Ty = Op0->getType();
// X / undef -> undef
// X % undef -> undef
if (match(Op1, m_Undef()))
return Op1;
// X / 0 -> undef
// X % 0 -> undef
// We don't need to preserve faults!
if (match(Op1, m_Zero()))
return UndefValue::get(Ty);
// If any element of a constant divisor vector is zero or undef, the whole op
// is undef.
auto *Op1C = dyn_cast<Constant>(Op1);
if (Op1C && Ty->isVectorTy()) {
unsigned NumElts = Ty->getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Elt = Op1C->getAggregateElement(i);
if (Elt && (Elt->isNullValue() || isa<UndefValue>(Elt)))
return UndefValue::get(Ty);
}
}
// undef / X -> 0
// undef % X -> 0
if (match(Op0, m_Undef()))
return Constant::getNullValue(Ty);
// 0 / X -> 0
// 0 % X -> 0
if (match(Op0, m_Zero()))
return Constant::getNullValue(Op0->getType());
// X / X -> 1
// X % X -> 0
if (Op0 == Op1)
return IsDiv ? ConstantInt::get(Ty, 1) : Constant::getNullValue(Ty);
// X / 1 -> X
// X % 1 -> 0
// If this is a boolean op (single-bit element type), we can't have
// division-by-zero or remainder-by-zero, so assume the divisor is 1.
// Similarly, if we're zero-extending a boolean divisor, then assume it's a 1.
Value *X;
if (match(Op1, m_One()) || Ty->isIntOrIntVectorTy(1) ||
(match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
return IsDiv ? Op0 : Constant::getNullValue(Ty);
return nullptr;
}
/// Given a predicate and two operands, return true if the comparison is true.
/// This is a helper for div/rem simplification where we return some other value
/// when we can prove a relationship between the operands.
static bool isICmpTrue(ICmpInst::Predicate Pred, Value *LHS, Value *RHS,
const SimplifyQuery &Q, unsigned MaxRecurse) {
Value *V = SimplifyICmpInst(Pred, LHS, RHS, Q, MaxRecurse);
Constant *C = dyn_cast_or_null<Constant>(V);
return (C && C->isAllOnesValue());
}
/// Return true if we can simplify X / Y to 0. Remainder can adapt that answer
/// to simplify X % Y to X.
static bool isDivZero(Value *X, Value *Y, const SimplifyQuery &Q,
unsigned MaxRecurse, bool IsSigned) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return false;
if (IsSigned) {
// |X| / |Y| --> 0
//
// We require that 1 operand is a simple constant. That could be extended to
// 2 variables if we computed the sign bit for each.
//
// Make sure that a constant is not the minimum signed value because taking
// the abs() of that is undefined.
Type *Ty = X->getType();
const APInt *C;
if (match(X, m_APInt(C)) && !C->isMinSignedValue()) {
// Is the variable divisor magnitude always greater than the constant
// dividend magnitude?
// |Y| > |C| --> Y < -abs(C) or Y > abs(C)
Constant *PosDividendC = ConstantInt::get(Ty, C->abs());
Constant *NegDividendC = ConstantInt::get(Ty, -C->abs());
if (isICmpTrue(CmpInst::ICMP_SLT, Y, NegDividendC, Q, MaxRecurse) ||
isICmpTrue(CmpInst::ICMP_SGT, Y, PosDividendC, Q, MaxRecurse))
return true;
}
if (match(Y, m_APInt(C))) {
// Special-case: we can't take the abs() of a minimum signed value. If
// that's the divisor, then all we have to do is prove that the dividend
// is also not the minimum signed value.
if (C->isMinSignedValue())
return isICmpTrue(CmpInst::ICMP_NE, X, Y, Q, MaxRecurse);
// Is the variable dividend magnitude always less than the constant
// divisor magnitude?
// |X| < |C| --> X > -abs(C) and X < abs(C)
Constant *PosDivisorC = ConstantInt::get(Ty, C->abs());
Constant *NegDivisorC = ConstantInt::get(Ty, -C->abs());
if (isICmpTrue(CmpInst::ICMP_SGT, X, NegDivisorC, Q, MaxRecurse) &&
isICmpTrue(CmpInst::ICMP_SLT, X, PosDivisorC, Q, MaxRecurse))
return true;
}
return false;
}
// IsSigned == false.
// Is the dividend unsigned less than the divisor?
return isICmpTrue(ICmpInst::ICMP_ULT, X, Y, Q, MaxRecurse);
}
/// These are simplifications common to SDiv and UDiv.
static Value *simplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
return C;
if (Value *V = simplifyDivRem(Op0, Op1, true))
return V;
bool IsSigned = Opcode == Instruction::SDiv;
// (X * Y) / Y -> X if the multiplication does not overflow.
Value *X;
if (match(Op0, m_c_Mul(m_Value(X), m_Specific(Op1)))) {
auto *Mul = cast<OverflowingBinaryOperator>(Op0);
// If the Mul does not overflow, then we are good to go.
if ((IsSigned && Q.IIQ.hasNoSignedWrap(Mul)) ||
(!IsSigned && Q.IIQ.hasNoUnsignedWrap(Mul)))
return X;
// If X has the form X = A / Y, then X * Y cannot overflow.
if ((IsSigned && match(X, m_SDiv(m_Value(), m_Specific(Op1)))) ||
(!IsSigned && match(X, m_UDiv(m_Value(), m_Specific(Op1)))))
return X;
}
// (X rem Y) / Y -> 0
if ((IsSigned && match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) ||
(!IsSigned && match(Op0, m_URem(m_Value(), m_Specific(Op1)))))
return Constant::getNullValue(Op0->getType());
// (X /u C1) /u C2 -> 0 if C1 * C2 overflow
ConstantInt *C1, *C2;
if (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_ConstantInt(C1))) &&
match(Op1, m_ConstantInt(C2))) {
bool Overflow;
(void)C1->getValue().umul_ov(C2->getValue(), Overflow);
if (Overflow)
return Constant::getNullValue(Op0->getType());
}
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
// If the operation is with the result of a phi instruction, check whether
// operating on all incoming values of the phi always yields the same value.
if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
if (isDivZero(Op0, Op1, Q, MaxRecurse, IsSigned))
return Constant::getNullValue(Op0->getType());
return nullptr;
}
/// These are simplifications common to SRem and URem.
static Value *simplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
return C;
if (Value *V = simplifyDivRem(Op0, Op1, false))
return V;
// (X % Y) % Y -> X % Y
if ((Opcode == Instruction::SRem &&
match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) ||
(Opcode == Instruction::URem &&
match(Op0, m_URem(m_Value(), m_Specific(Op1)))))
return Op0;
// (X << Y) % X -> 0
if (Q.IIQ.UseInstrInfo &&
((Opcode == Instruction::SRem &&
match(Op0, m_NSWShl(m_Specific(Op1), m_Value()))) ||
(Opcode == Instruction::URem &&
match(Op0, m_NUWShl(m_Specific(Op1), m_Value())))))
return Constant::getNullValue(Op0->getType());
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
// If the operation is with the result of a phi instruction, check whether
// operating on all incoming values of the phi always yields the same value.
if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
// If X / Y == 0, then X % Y == X.
if (isDivZero(Op0, Op1, Q, MaxRecurse, Opcode == Instruction::SRem))
return Op0;
return nullptr;
}
/// Given operands for an SDiv, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
// If two operands are negated and no signed overflow, return -1.
if (isKnownNegation(Op0, Op1, /*NeedNSW=*/true))
return Constant::getAllOnesValue(Op0->getType());
return simplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse);
}
Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
return ::SimplifySDivInst(Op0, Op1, Q, RecursionLimit);
}
/// Given operands for a UDiv, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
return simplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse);
}
Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
return ::SimplifyUDivInst(Op0, Op1, Q, RecursionLimit);
}
/// Given operands for an SRem, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
// If the divisor is 0, the result is undefined, so assume the divisor is -1.
// srem Op0, (sext i1 X) --> srem Op0, -1 --> 0
Value *X;
if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
return ConstantInt::getNullValue(Op0->getType());
// If the two operands are negated, return 0.
if (isKnownNegation(Op0, Op1))
return ConstantInt::getNullValue(Op0->getType());
return simplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse);
}
Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
return ::SimplifySRemInst(Op0, Op1, Q, RecursionLimit);
}
/// Given operands for a URem, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
return simplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse);
}
Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
return ::SimplifyURemInst(Op0, Op1, Q, RecursionLimit);
}
/// Returns true if a shift by \c Amount always yields undef.
static bool isUndefShift(Value *Amount) {
Constant *C = dyn_cast<Constant>(Amount);
if (!C)
return false;
// X shift by undef -> undef because it may shift by the bitwidth.
if (isa<UndefValue>(C))
return true;
// Shifting by the bitwidth or more is undefined.
if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
if (CI->getValue().getLimitedValue() >=
CI->getType()->getScalarSizeInBits())
return true;
// If all lanes of a vector shift are undefined the whole shift is.
if (isa<ConstantVector>(C) || isa<ConstantDataVector>(C)) {
for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; ++I)
if (!isUndefShift(C->getAggregateElement(I)))
return false;
return true;
}
return false;
}
/// Given operands for an Shl, LShr or AShr, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
return C;
// 0 shift by X -> 0
if (match(Op0, m_Zero()))
return Constant::getNullValue(Op0->getType());
// X shift by 0 -> X
// Shift-by-sign-extended bool must be shift-by-0 because shift-by-all-ones
// would be poison.
Value *X;
if (match(Op1, m_Zero()) ||
(match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
return Op0;
// Fold undefined shifts.
if (isUndefShift(Op1))
return UndefValue::get(Op0->getType());
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
// If the operation is with the result of a phi instruction, check whether
// operating on all incoming values of the phi always yields the same value.
if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
// If any bits in the shift amount make that value greater than or equal to
// the number of bits in the type, the shift is undefined.
KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (Known.One.getLimitedValue() >= Known.getBitWidth())
return UndefValue::get(Op0->getType());
// If all valid bits in the shift amount are known zero, the first operand is
// unchanged.
unsigned NumValidShiftBits = Log2_32_Ceil(Known.getBitWidth());
if (Known.countMinTrailingZeros() >= NumValidShiftBits)
return Op0;
return nullptr;
}
/// Given operands for an Shl, LShr or AShr, see if we can
/// fold the result. If not, this returns null.
static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0,
Value *Op1, bool isExact, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
// X >> X -> 0
if (Op0 == Op1)
return Constant::getNullValue(Op0->getType());
// undef >> X -> 0
// undef >> X -> undef (if it's exact)
if (match(Op0, m_Undef()))
return isExact ? Op0 : Constant::getNullValue(Op0->getType());
// The low bit cannot be shifted out of an exact shift if it is set.
if (isExact) {
KnownBits Op0Known = computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
if (Op0Known.One[0])
return Op0;
}
return nullptr;
}
/// Given operands for an Shl, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse))
return V;
// undef << X -> 0
// undef << X -> undef (if it's NSW/NUW)
if (match(Op0, m_Undef()))
return isNSW || isNUW ? Op0 : Constant::getNullValue(Op0->getType());
// (X >> A) << A -> X
Value *X;
if (Q.IIQ.UseInstrInfo &&
match(Op0, m_Exact(m_Shr(m_Value(X), m_Specific(Op1)))))
return X;
// shl nuw i8 C, %x -> C iff C has sign bit set.
if (isNUW && match(Op0, m_Negative()))
return Op0;
// NOTE: could use computeKnownBits() / LazyValueInfo,
// but the cost-benefit analysis suggests it isn't worth it.
return nullptr;
}
Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
const SimplifyQuery &Q) {
return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit);
}
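// Illustrative sketches of the shl folds above (assumed examples):
//   %t = lshr exact i32 %x, %a
//   %r = shl i32 %t, %a          ; --> %x
//
//   %s = shl nuw i8 -128, %n     ; --> -128 (any nonzero %n would shift the
//                                ;     sign bit out and violate 'nuw')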
/// Given operands for an LShr, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q,
MaxRecurse))
return V;
// (X << A) >> A -> X
Value *X;
if (match(Op0, m_NUWShl(m_Value(X), m_Specific(Op1))))
return X;
// ((X << A) | Y) >> A -> X if effective width of Y is not larger than A.
// We can return X as we do in the above case since OR alters no bits in X.
// SimplifyDemandedBits in InstCombine can do more general optimization for
// bit manipulation. This pattern aims to provide opportunities for other
// optimizers by supporting a simple but common case in InstSimplify.
Value *Y;
const APInt *ShRAmt, *ShLAmt;
if (match(Op1, m_APInt(ShRAmt)) &&
match(Op0, m_c_Or(m_NUWShl(m_Value(X), m_APInt(ShLAmt)), m_Value(Y))) &&
*ShRAmt == *ShLAmt) {
const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
const unsigned Width = Op0->getType()->getScalarSizeInBits();
const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros();
if (ShRAmt->uge(EffWidthY))
return X;
}
return nullptr;
}
Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
const SimplifyQuery &Q) {
return ::SimplifyLShrInst(Op0, Op1, isExact, Q, RecursionLimit);
}
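// Illustrative sketch of the ((X << A) | Y) >> A fold above (assumed example):
//   %lo = and i32 %y, 255        ; effective width of %lo is 8
//   %hi = shl nuw i32 %x, 8
//   %o  = or i32 %hi, %lo
//   %r  = lshr i32 %o, 8         ; --> %x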
/// Given operands for an AShr, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q,
MaxRecurse))
return V;
// all ones >>a X -> -1
// Do not return Op0 because it may contain undef elements if it's a vector.
if (match(Op0, m_AllOnes()))
return Constant::getAllOnesValue(Op0->getType());
// (X << A) >> A -> X
Value *X;
if (Q.IIQ.UseInstrInfo && match(Op0, m_NSWShl(m_Value(X), m_Specific(Op1))))
return X;
// Arithmetic shifting an all-sign-bit value is a no-op.
unsigned NumSignBits = ComputeNumSignBits(Op0, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (NumSignBits == Op0->getType()->getScalarSizeInBits())
return Op0;
return nullptr;
}
Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
const SimplifyQuery &Q) {
return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit);
}
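// Illustrative sketch of the all-sign-bits ashr fold above (assumed example):
//   %s = ashr i32 %x, 31         ; every bit of %s is a copy of the sign bit
//   %r = ashr i32 %s, %n         ; --> %s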
/// Commuted variants are assumed to be handled by calling this function again
/// with the parameters swapped.
static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
ICmpInst *UnsignedICmp, bool IsAnd) {
Value *X, *Y;
ICmpInst::Predicate EqPred;
if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(Y), m_Zero())) ||
!ICmpInst::isEquality(EqPred))
return nullptr;
ICmpInst::Predicate UnsignedPred;
if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Value(X), m_Specific(Y))) &&
ICmpInst::isUnsigned(UnsignedPred))
;
else if (match(UnsignedICmp,
m_ICmp(UnsignedPred, m_Specific(Y), m_Value(X))) &&
ICmpInst::isUnsigned(UnsignedPred))
UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred);
else
return nullptr;
// X < Y && Y != 0 --> X < Y
// X < Y || Y != 0 --> Y != 0
if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE)
return IsAnd ? UnsignedICmp : ZeroICmp;
// X >= Y || Y != 0 --> true
// X >= Y || Y == 0 --> X >= Y
if (UnsignedPred == ICmpInst::ICMP_UGE && !IsAnd) {
if (EqPred == ICmpInst::ICMP_NE)
return getTrue(UnsignedICmp->getType());
return UnsignedICmp;
}
// X < Y && Y == 0 --> false
if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_EQ &&
IsAnd)
return getFalse(UnsignedICmp->getType());
return nullptr;
}
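// Illustrative sketches of the unsigned range checks above (assumed examples):
//   %c1 = icmp ult i32 %x, %y
//   %c2 = icmp ne i32 %y, 0
//   and i1 %c1, %c2              ; --> %c1 (%x u< %y already implies %y != 0)
//
//   %c3 = icmp uge i32 %x, %y
//   or i1 %c3, %c2               ; --> true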
/// Commuted variants are assumed to be handled by calling this function again
/// with the parameters swapped.
static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
ICmpInst::Predicate Pred0, Pred1;
Value *A, *B;
if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) ||
!match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B))))
return nullptr;
// We have (icmp Pred0, A, B) & (icmp Pred1, A, B).
// If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we
// can eliminate Op1 from this 'and'.
if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1))
return Op0;
// Check for any combination of predicates that are guaranteed to be disjoint.
if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) ||
(Pred0 == ICmpInst::ICMP_EQ && ICmpInst::isFalseWhenEqual(Pred1)) ||
(Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT) ||
(Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT))
return getFalse(Op0->getType());
return nullptr;
}
/// Commuted variants are assumed to be handled by calling this function again
/// with the parameters swapped.
static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
ICmpInst::Predicate Pred0, Pred1;
Value *A, *B;
if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) ||
!match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B))))
return nullptr;
// We have (icmp Pred0, A, B) | (icmp Pred1, A, B).
// If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we
// can eliminate Op0 from this 'or'.
if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1))
return Op1;
// Check for any combination of predicates that cover the entire range of
// possibilities.
if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) ||
(Pred0 == ICmpInst::ICMP_NE && ICmpInst::isTrueWhenEqual(Pred1)) ||
(Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGE) ||
(Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGE))
return getTrue(Op0->getType());
return nullptr;
}
/// Test if a pair of compares with a shared operand and 2 constants has an
/// empty set intersection, full set union, or if one compare is a superset of
/// the other.
static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1,
bool IsAnd) {
// Look for this pattern: {and/or} (icmp X, C0), (icmp X, C1)).
if (Cmp0->getOperand(0) != Cmp1->getOperand(0))
return nullptr;
const APInt *C0, *C1;
if (!match(Cmp0->getOperand(1), m_APInt(C0)) ||
!match(Cmp1->getOperand(1), m_APInt(C1)))
return nullptr;
auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0);
auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1);
// For and-of-compares, check if the intersection is empty:
// (icmp X, C0) && (icmp X, C1) --> empty set --> false
if (IsAnd && Range0.intersectWith(Range1).isEmptySet())
return getFalse(Cmp0->getType());
// For or-of-compares, check if the union is full:
// (icmp X, C0) || (icmp X, C1) --> full set --> true
if (!IsAnd && Range0.unionWith(Range1).isFullSet())
return getTrue(Cmp0->getType());
// Is one range a superset of the other?
// If this is and-of-compares, take the smaller set:
// (icmp sgt X, 4) && (icmp sgt X, 42) --> icmp sgt X, 42
// If this is or-of-compares, take the larger set:
// (icmp sgt X, 4) || (icmp sgt X, 42) --> icmp sgt X, 4
if (Range0.contains(Range1))
return IsAnd ? Cmp1 : Cmp0;
if (Range1.contains(Range0))
return IsAnd ? Cmp0 : Cmp1;
return nullptr;
}
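// Illustrative sketches of the constant-range folds above (assumed examples):
//   (icmp ult i8 %x, 4)  & (icmp ugt i8 %x, 10) --> false (empty intersection)
//   (icmp ult i8 %x, 10) | (icmp ugt i8 %x, 5)  --> true  (full union)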
static Value *simplifyAndOrOfICmpsWithZero(ICmpInst *Cmp0, ICmpInst *Cmp1,
bool IsAnd) {
ICmpInst::Predicate P0 = Cmp0->getPredicate(), P1 = Cmp1->getPredicate();
if (!match(Cmp0->getOperand(1), m_Zero()) ||
!match(Cmp1->getOperand(1), m_Zero()) || P0 != P1)
return nullptr;
if ((IsAnd && P0 != ICmpInst::ICMP_NE) || (!IsAnd && P1 != ICmpInst::ICMP_EQ))
return nullptr;
// We have either "(X == 0 || Y == 0)" or "(X != 0 && Y != 0)".
Value *X = Cmp0->getOperand(0);
Value *Y = Cmp1->getOperand(0);
// If one of the compares is a masked version of a (not) null check, then
// that compare implies the other, so we eliminate the other. Optionally, look
// through a pointer-to-int cast to match a null check of a pointer type.
// (X == 0) || (([ptrtoint] X & ?) == 0) --> ([ptrtoint] X & ?) == 0
// (X == 0) || ((? & [ptrtoint] X) == 0) --> (? & [ptrtoint] X) == 0
// (X != 0) && (([ptrtoint] X & ?) != 0) --> ([ptrtoint] X & ?) != 0
// (X != 0) && ((? & [ptrtoint] X) != 0) --> (? & [ptrtoint] X) != 0
if (match(Y, m_c_And(m_Specific(X), m_Value())) ||
match(Y, m_c_And(m_PtrToInt(m_Specific(X)), m_Value())))
return Cmp1;
// (([ptrtoint] Y & ?) == 0) || (Y == 0) --> ([ptrtoint] Y & ?) == 0
// ((? & [ptrtoint] Y) == 0) || (Y == 0) --> (? & [ptrtoint] Y) == 0
// (([ptrtoint] Y & ?) != 0) && (Y != 0) --> ([ptrtoint] Y & ?) != 0
// ((? & [ptrtoint] Y) != 0) && (Y != 0) --> (? & [ptrtoint] Y) != 0
if (match(X, m_c_And(m_Specific(Y), m_Value())) ||
match(X, m_c_And(m_PtrToInt(m_Specific(Y)), m_Value())))
return Cmp0;
return nullptr;
}
static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1,
const InstrInfoQuery &IIQ) {
// (icmp (add V, C0), C1) & (icmp V, C0)
ICmpInst::Predicate Pred0, Pred1;
const APInt *C0, *C1;
Value *V;
if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1))))
return nullptr;
if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value())))
return nullptr;
auto *AddInst = cast<OverflowingBinaryOperator>(Op0->getOperand(0));
if (AddInst->getOperand(1) != Op1->getOperand(1))
return nullptr;
Type *ITy = Op0->getType();
bool isNSW = IIQ.hasNoSignedWrap(AddInst);
bool isNUW = IIQ.hasNoUnsignedWrap(AddInst);
const APInt Delta = *C1 - *C0;
if (C0->isStrictlyPositive()) {
if (Delta == 2) {
if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_SGT)
return getFalse(ITy);
if (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT && isNSW)
return getFalse(ITy);
}
if (Delta == 1) {
if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_SGT)
return getFalse(ITy);
if (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGT && isNSW)
return getFalse(ITy);
}
}
if (C0->getBoolValue() && isNUW) {
if (Delta == 2)
if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT)
return getFalse(ITy);
if (Delta == 1)
if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGT)
return getFalse(ITy);
}
return nullptr;
}
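// Illustrative sketch of the add-based fold above (assumed example with
// C0 = 1, C1 = 3, so Delta = 2):
//   %a  = add i8 %v, 1
//   %c1 = icmp ult i8 %a, 3      ; true only for %v in {-1, 0, 1}
//   %c2 = icmp sgt i8 %v, 1
//   and i1 %c1, %c2              ; --> false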
static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1,
const InstrInfoQuery &IIQ) {
if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true))
return X;
if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true))
return X;
if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1))
return X;
if (Value *X = simplifyAndOfICmpsWithSameOperands(Op1, Op0))
return X;
if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true))
return X;
if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, true))
return X;
if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1, IIQ))
return X;
if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0, IIQ))
return X;
return nullptr;
}
static Value *simplifyOrOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1,
const InstrInfoQuery &IIQ) {
// (icmp (add V, C0), C1) | (icmp V, C0)
ICmpInst::Predicate Pred0, Pred1;
const APInt *C0, *C1;
Value *V;
if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1))))
return nullptr;
if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value())))
return nullptr;
auto *AddInst = cast<BinaryOperator>(Op0->getOperand(0));
if (AddInst->getOperand(1) != Op1->getOperand(1))
return nullptr;
Type *ITy = Op0->getType();
bool isNSW = IIQ.hasNoSignedWrap(AddInst);
bool isNUW = IIQ.hasNoUnsignedWrap(AddInst);
const APInt Delta = *C1 - *C0;
if (C0->isStrictlyPositive()) {
if (Delta == 2) {
if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_SLE)
return getTrue(ITy);
if (Pred0 == ICmpInst::ICMP_SGE && Pred1 == ICmpInst::ICMP_SLE && isNSW)
return getTrue(ITy);
}
if (Delta == 1) {
if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_SLE)
return getTrue(ITy);
if (Pred0 == ICmpInst::ICMP_SGT && Pred1 == ICmpInst::ICMP_SLE && isNSW)
return getTrue(ITy);
}
}
if (C0->getBoolValue() && isNUW) {
if (Delta == 2)
if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_ULE)
return getTrue(ITy);
if (Delta == 1)
if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_ULE)
return getTrue(ITy);
}
return nullptr;
}
static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1,
const InstrInfoQuery &IIQ) {
if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false))
return X;
if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false))
return X;
if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1))
return X;
if (Value *X = simplifyOrOfICmpsWithSameOperands(Op1, Op0))
return X;
if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false))
return X;
if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, false))
return X;
if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1, IIQ))
return X;
if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0, IIQ))
return X;
return nullptr;
}
static Value *simplifyAndOrOfFCmps(const TargetLibraryInfo *TLI,
FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) {
Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
if (LHS0->getType() != RHS0->getType())
return nullptr;
FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) ||
(PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) {
// (fcmp ord NNAN, X) & (fcmp ord X, Y) --> fcmp ord X, Y
// (fcmp ord NNAN, X) & (fcmp ord Y, X) --> fcmp ord Y, X
// (fcmp ord X, NNAN) & (fcmp ord X, Y) --> fcmp ord X, Y
// (fcmp ord X, NNAN) & (fcmp ord Y, X) --> fcmp ord Y, X
// (fcmp uno NNAN, X) | (fcmp uno X, Y) --> fcmp uno X, Y
// (fcmp uno NNAN, X) | (fcmp uno Y, X) --> fcmp uno Y, X
// (fcmp uno X, NNAN) | (fcmp uno X, Y) --> fcmp uno X, Y
// (fcmp uno X, NNAN) | (fcmp uno Y, X) --> fcmp uno Y, X
if ((isKnownNeverNaN(LHS0, TLI) && (LHS1 == RHS0 || LHS1 == RHS1)) ||
(isKnownNeverNaN(LHS1, TLI) && (LHS0 == RHS0 || LHS0 == RHS1)))
return RHS;
// (fcmp ord X, Y) & (fcmp ord NNAN, X) --> fcmp ord X, Y
// (fcmp ord Y, X) & (fcmp ord NNAN, X) --> fcmp ord Y, X
// (fcmp ord X, Y) & (fcmp ord X, NNAN) --> fcmp ord X, Y
// (fcmp ord Y, X) & (fcmp ord X, NNAN) --> fcmp ord Y, X
// (fcmp uno X, Y) | (fcmp uno NNAN, X) --> fcmp uno X, Y
// (fcmp uno Y, X) | (fcmp uno NNAN, X) --> fcmp uno Y, X
// (fcmp uno X, Y) | (fcmp uno X, NNAN) --> fcmp uno X, Y
// (fcmp uno Y, X) | (fcmp uno X, NNAN) --> fcmp uno Y, X
if ((isKnownNeverNaN(RHS0, TLI) && (RHS1 == LHS0 || RHS1 == LHS1)) ||
(isKnownNeverNaN(RHS1, TLI) && (RHS0 == LHS0 || RHS0 == LHS1)))
return LHS;
}
return nullptr;
}
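// Illustrative sketch of the ord/uno folds above (assumed example; a
// floating-point constant is trivially known not to be NaN):
//   %c1 = fcmp ord float 0.0, %x
//   %c2 = fcmp ord float %x, %y
//   and i1 %c1, %c2              ; --> %c2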
static Value *simplifyAndOrOfCmps(const SimplifyQuery &Q,
Value *Op0, Value *Op1, bool IsAnd) {
// Look through casts of the 'and' operands to find compares.
auto *Cast0 = dyn_cast<CastInst>(Op0);
auto *Cast1 = dyn_cast<CastInst>(Op1);
if (Cast0 && Cast1 && Cast0->getOpcode() == Cast1->getOpcode() &&
Cast0->getSrcTy() == Cast1->getSrcTy()) {
Op0 = Cast0->getOperand(0);
Op1 = Cast1->getOperand(0);
}
Value *V = nullptr;
auto *ICmp0 = dyn_cast<ICmpInst>(Op0);
auto *ICmp1 = dyn_cast<ICmpInst>(Op1);
if (ICmp0 && ICmp1)
V = IsAnd ? simplifyAndOfICmps(ICmp0, ICmp1, Q.IIQ)
: simplifyOrOfICmps(ICmp0, ICmp1, Q.IIQ);
auto *FCmp0 = dyn_cast<FCmpInst>(Op0);
auto *FCmp1 = dyn_cast<FCmpInst>(Op1);
if (FCmp0 && FCmp1)
V = simplifyAndOrOfFCmps(Q.TLI, FCmp0, FCmp1, IsAnd);
if (!V)
return nullptr;
if (!Cast0)
return V;
// If we looked through casts, we can only handle a constant simplification
// because we are not allowed to create a cast instruction here.
if (auto *C = dyn_cast<Constant>(V))
return ConstantExpr::getCast(Cast0->getOpcode(), C, Cast0->getType());
return nullptr;
}
/// Given operands for an And, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q))
return C;
// X & undef -> 0
if (match(Op1, m_Undef()))
return Constant::getNullValue(Op0->getType());
// X & X = X
if (Op0 == Op1)
return Op0;
// X & 0 = 0
if (match(Op1, m_Zero()))
return Constant::getNullValue(Op0->getType());
// X & -1 = X
if (match(Op1, m_AllOnes()))
return Op0;
// A & ~A = ~A & A = 0
if (match(Op0, m_Not(m_Specific(Op1))) ||
match(Op1, m_Not(m_Specific(Op0))))
return Constant::getNullValue(Op0->getType());
// (A | ?) & A = A
if (match(Op0, m_c_Or(m_Specific(Op1), m_Value())))
return Op1;
// A & (A | ?) = A
if (match(Op1, m_c_Or(m_Specific(Op0), m_Value())))
return Op0;
// A mask that only clears known zeros of a shifted value is a no-op.
Value *X;
const APInt *Mask;
const APInt *ShAmt;
if (match(Op1, m_APInt(Mask))) {
// If all bits in the inverted and shifted mask are clear:
// and (shl X, ShAmt), Mask --> shl X, ShAmt
if (match(Op0, m_Shl(m_Value(X), m_APInt(ShAmt))) &&
(~(*Mask)).lshr(*ShAmt).isNullValue())
return Op0;
// If all bits in the inverted and shifted mask are clear:
// and (lshr X, ShAmt), Mask --> lshr X, ShAmt
if (match(Op0, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
(~(*Mask)).shl(*ShAmt).isNullValue())
return Op0;
}
// A & (-A) = A if A is a power of two or zero.
if (match(Op0, m_Neg(m_Specific(Op1))) ||
match(Op1, m_Neg(m_Specific(Op0)))) {
if (isKnownToBeAPowerOfTwo(Op0, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI,
Q.DT))
return Op0;
if (isKnownToBeAPowerOfTwo(Op1, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI,
Q.DT))
return Op1;
}
// This is a similar pattern used for checking if a value is a power-of-2:
// (A - 1) & A --> 0 (if A is a power-of-2 or 0)
// A & (A - 1) --> 0 (if A is a power-of-2 or 0)
if (match(Op0, m_Add(m_Specific(Op1), m_AllOnes())) &&
isKnownToBeAPowerOfTwo(Op1, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT))
return Constant::getNullValue(Op1->getType());
if (match(Op1, m_Add(m_Specific(Op0), m_AllOnes())) &&
isKnownToBeAPowerOfTwo(Op0, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT))
return Constant::getNullValue(Op0->getType());
if (Value *V = simplifyAndOrOfCmps(Q, Op0, Op1, true))
return V;
// Try some generic simplifications for associative operations.
if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q,
MaxRecurse))
return V;
// And distributes over Or. Try some generic simplifications based on this.
if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Or,
Q, MaxRecurse))
return V;
// And distributes over Xor. Try some generic simplifications based on this.
if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Xor,
Q, MaxRecurse))
return V;
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
if (Value *V = ThreadBinOpOverSelect(Instruction::And, Op0, Op1, Q,
MaxRecurse))
return V;
// If the operation is with the result of a phi instruction, check whether
// operating on all incoming values of the phi always yields the same value.
if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
if (Value *V = ThreadBinOpOverPHI(Instruction::And, Op0, Op1, Q,
MaxRecurse))
return V;
// Assuming the effective width of Y is not larger than A, i.e. all bits
// from X and Y are disjoint in (X << A) | Y,
// if the mask of this AND op covers all bits of X or Y, while it covers
// no bits from the other, we can bypass this AND op. E.g.,
// ((X << A) | Y) & Mask -> Y,
// if Mask = ((1 << effective_width_of(Y)) - 1)
// ((X << A) | Y) & Mask -> X << A,
// if Mask = ((1 << effective_width_of(X)) - 1) << A
// SimplifyDemandedBits in InstCombine can optimize the general case.
// This pattern aims to help other passes for a common case.
Value *Y, *XShifted;
if (match(Op1, m_APInt(Mask)) &&
match(Op0, m_c_Or(m_CombineAnd(m_NUWShl(m_Value(X), m_APInt(ShAmt)),
m_Value(XShifted)),
m_Value(Y)))) {
const unsigned Width = Op0->getType()->getScalarSizeInBits();
const unsigned ShftCnt = ShAmt->getLimitedValue(Width);
const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros();
if (EffWidthY <= ShftCnt) {
const KnownBits XKnown = computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI,
Q.DT);
const unsigned EffWidthX = Width - XKnown.countMinLeadingZeros();
const APInt EffBitsY = APInt::getLowBitsSet(Width, EffWidthY);
const APInt EffBitsX = APInt::getLowBitsSet(Width, EffWidthX) << ShftCnt;
// If the mask is extracting all bits from X or Y as is, we can skip
// this AND op.
if (EffBitsY.isSubsetOf(*Mask) && !EffBitsX.intersects(*Mask))
return Y;
if (EffBitsX.isSubsetOf(*Mask) && !EffBitsY.intersects(*Mask))
return XShifted;
}
}
return nullptr;
}
Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit);
}
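// Illustrative sketch of the ((X << A) | Y) & Mask fold above (assumed
// example):
//   %lo = and i32 %y, 255
//   %hi = shl nuw i32 %x, 8
//   %o  = or i32 %hi, %lo
//   %r  = and i32 %o, 255        ; --> %lo (the mask covers only %lo's bits)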
/// Given operands for an Or, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q))
return C;
// X | undef -> -1
// X | -1 = -1
// Do not return Op1 because it may contain undef elements if it's a vector.
if (match(Op1, m_Undef()) || match(Op1, m_AllOnes()))
return Constant::getAllOnesValue(Op0->getType());
// X | X = X
// X | 0 = X
if (Op0 == Op1 || match(Op1, m_Zero()))
return Op0;
// A | ~A = ~A | A = -1
if (match(Op0, m_Not(m_Specific(Op1))) ||
match(Op1, m_Not(m_Specific(Op0))))
return Constant::getAllOnesValue(Op0->getType());
// (A & ?) | A = A
if (match(Op0, m_c_And(m_Specific(Op1), m_Value())))
return Op1;
// A | (A & ?) = A
if (match(Op1, m_c_And(m_Specific(Op0), m_Value())))
return Op0;
// ~(A & ?) | A = -1
if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value()))))
return Constant::getAllOnesValue(Op1->getType());
// A | ~(A & ?) = -1
if (match(Op1, m_Not(m_c_And(m_Specific(Op0), m_Value()))))
return Constant::getAllOnesValue(Op0->getType());
Value *A, *B;
// (A & ~B) | (A ^ B) -> (A ^ B)
// (~B & A) | (A ^ B) -> (A ^ B)
// (A & ~B) | (B ^ A) -> (B ^ A)
// (~B & A) | (B ^ A) -> (B ^ A)
if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
(match(Op0, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) ||
match(Op0, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))))
return Op1;
// Commute the 'or' operands.
// (A ^ B) | (A & ~B) -> (A ^ B)
// (A ^ B) | (~B & A) -> (A ^ B)
// (B ^ A) | (A & ~B) -> (B ^ A)
// (B ^ A) | (~B & A) -> (B ^ A)
if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
(match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) ||
match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))))
return Op0;
// (A & B) | (~A ^ B) -> (~A ^ B)
// (B & A) | (~A ^ B) -> (~A ^ B)
// (A & B) | (B ^ ~A) -> (B ^ ~A)
// (B & A) | (B ^ ~A) -> (B ^ ~A)
if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
(match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
return Op1;
// (~A ^ B) | (A & B) -> (~A ^ B)
// (~A ^ B) | (B & A) -> (~A ^ B)
// (B ^ ~A) | (A & B) -> (B ^ ~A)
// (B ^ ~A) | (B & A) -> (B ^ ~A)
if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
(match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
return Op0;
if (Value *V = simplifyAndOrOfCmps(Q, Op0, Op1, false))
return V;
// Try some generic simplifications for associative operations.
if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q,
MaxRecurse))
return V;
// Or distributes over And. Try some generic simplifications based on this.
if (Value *V = ExpandBinOp(Instruction::Or, Op0, Op1, Instruction::And, Q,
MaxRecurse))
return V;
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
if (Value *V = ThreadBinOpOverSelect(Instruction::Or, Op0, Op1, Q,
MaxRecurse))
return V;
// (A & C1)|(B & C2)
const APInt *C1, *C2;
if (match(Op0, m_And(m_Value(A), m_APInt(C1))) &&
match(Op1, m_And(m_Value(B), m_APInt(C2)))) {
if (*C1 == ~*C2) {
// (A & C1)|(B & C2)
// If we have: ((V + N) & C1) | (V & C2)
// .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
// replace with V+N.
Value *N;
if (C2->isMask() && // C2 == 0+1+
match(A, m_c_Add(m_Specific(B), m_Value(N)))) {
// Add commutes, try both ways.
if (MaskedValueIsZero(N, *C2, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return A;
}
// Or commutes, try both ways.
if (C1->isMask() &&
match(B, m_c_Add(m_Specific(A), m_Value(N)))) {
// Add commutes, try both ways.
if (MaskedValueIsZero(N, *C1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return B;
}
}
}
// If the operation is with the result of a phi instruction, check whether
// operating on all incoming values of the phi always yields the same value.
if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse))
return V;
return nullptr;
}
Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
return ::SimplifyOrInst(Op0, Op1, Q, RecursionLimit);
}
/// Given operands for a Xor, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q))
return C;
// A ^ undef -> undef
if (match(Op1, m_Undef()))
return Op1;
// A ^ 0 = A
if (match(Op1, m_Zero()))
return Op0;
// A ^ A = 0
if (Op0 == Op1)
return Constant::getNullValue(Op0->getType());
// A ^ ~A = ~A ^ A = -1
if (match(Op0, m_Not(m_Specific(Op1))) ||
match(Op1, m_Not(m_Specific(Op0))))
return Constant::getAllOnesValue(Op0->getType());
// Try some generic simplifications for associative operations.
if (Value *V = SimplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, Q,
MaxRecurse))
return V;
// Threading Xor over selects and phi nodes is pointless, so don't bother.
// Threading over the select in "A ^ select(cond, B, C)" means evaluating
// "A^B" and "A^C" and seeing if they are equal; but they are equal if and
// only if B and C are equal. If B and C are equal then (since we assume
// that operands have already been simplified) "select(cond, B, C)" should
// have been simplified to the common value of B and C already. Analysing
// "A^B" and "A^C" thus gains nothing, but costs compile time. Similarly
// for threading over phi nodes.
return nullptr;
}
Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
return ::SimplifyXorInst(Op0, Op1, Q, RecursionLimit);
}
static Type *GetCompareTy(Value *Op) {
return CmpInst::makeCmpResultType(Op->getType());
}
/// Rummage around inside V looking for something equivalent to the comparison
/// "LHS Pred RHS". Return such a value if found, otherwise return null.
/// Helper function for analyzing max/min idioms.
static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred,
Value *LHS, Value *RHS) {
SelectInst *SI = dyn_cast<SelectInst>(V);
if (!SI)
return nullptr;
CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
if (!Cmp)
return nullptr;
Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1);
if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS)
return Cmp;
if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) &&
LHS == CmpRHS && RHS == CmpLHS)
return Cmp;
return nullptr;
}
// A significant optimization not implemented here is assuming that alloca
// addresses are not equal to incoming argument values. They don't *alias*,
// as we say, but that doesn't mean they aren't equal, so we take a
// conservative approach.
//
// This is inspired in part by C++11 5.10p1:
// "Two pointers of the same type compare equal if and only if they are both
// null, both point to the same function, or both represent the same
// address."
//
// This is pretty permissive.
//
// It's also partly due to C11 6.5.9p6:
// "Two pointers compare equal if and only if both are null pointers, both are
// pointers to the same object (including a pointer to an object and a
// subobject at its beginning) or function, both are pointers to one past the
// last element of the same array object, or one is a pointer to one past the
// end of one array object and the other is a pointer to the start of a
// different array object that happens to immediately follow the first array
// object in the address space."
//
// C11's version is more restrictive; however, there's no reason why an argument
// couldn't be a one-past-the-end value for a stack object in the caller and be
// equal to the beginning of a stack object in the callee.
//
// If the C and C++ standards are ever made sufficiently restrictive in this
// area, it may be possible to update LLVM's semantics accordingly and reinstate
// this optimization.
static Constant *
computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI,
const DominatorTree *DT, CmpInst::Predicate Pred,
AssumptionCache *AC, const Instruction *CxtI,
const InstrInfoQuery &IIQ, Value *LHS, Value *RHS) {
// First, skip past any trivial no-ops.
LHS = LHS->stripPointerCasts();
RHS = RHS->stripPointerCasts();
// A non-null pointer is not equal to a null pointer.
if (llvm::isKnownNonZero(LHS, DL, 0, nullptr, nullptr, nullptr,
IIQ.UseInstrInfo) &&
isa<ConstantPointerNull>(RHS) &&
(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE))
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
// We can only fold certain predicates on pointer comparisons.
switch (Pred) {
default:
return nullptr;
// Equality comparisons are easy to fold.
case CmpInst::ICMP_EQ:
case CmpInst::ICMP_NE:
break;
// We can only handle unsigned relational comparisons because 'inbounds' on
// a GEP only protects against unsigned wrapping.
case CmpInst::ICMP_UGT:
case CmpInst::ICMP_UGE:
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
// However, we have to switch them to their signed variants to handle
// negative indices from the base pointer.
Pred = ICmpInst::getSignedPredicate(Pred);
break;
}
// Strip off any constant offsets so that we can reason about them.
// It's tempting to use getUnderlyingObject or even just stripInBoundsOffsets
// here and compare base addresses like AliasAnalysis does, however there are
// numerous hazards. AliasAnalysis and its utilities rely on special rules
// governing loads and stores which don't apply to icmps. Also, AliasAnalysis
// doesn't need to guarantee pointer inequality when it says NoAlias.
Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS);
Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS);
// If LHS and RHS are related via constant offsets to the same base
// value, we can replace it with an icmp which just compares the offsets.
if (LHS == RHS)
return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset);
// Various optimizations for (in)equality comparisons.
if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) {
// Different non-empty allocations that exist at the same time have
// different addresses (if the program can tell). Global variables always
// exist, so they always exist during the lifetime of each other and all
// allocas. Two different allocas usually have different addresses...
//
// However, if there's an @llvm.stackrestore dynamically in between two
// allocas, they may have the same address. It's tempting to reduce the
// scope of the problem by only looking at *static* allocas here. That would
// cover the majority of allocas while significantly reducing the likelihood
// of having an @llvm.stackrestore pop up in the middle. However, it's not
// actually impossible for an @llvm.stackrestore to pop up in the middle of
// an entry block. Also, if we have a block that's not attached to a
// function, we can't tell if it's "static" under the current definition.
// Theoretically, this problem could be fixed by creating a new instruction
// kind specifically for static allocas. Such a new instruction
// could be required to be at the top of the entry block, thus preventing it
// from being subject to a @llvm.stackrestore. Instcombine could even
// convert regular allocas into these special allocas. It'd be nifty.
// However, until then, this problem remains open.
//
// So, we'll assume that two non-empty allocas have different addresses
// for now.
//
// With all that, if the offsets are within the bounds of their allocations
// (and not one-past-the-end! so we can't use inbounds!), and their
// allocations aren't the same, the pointers are not equal.
//
// Note that it's not necessary to check for LHS being a global variable
// address, due to canonicalization and constant folding.
if (isa<AllocaInst>(LHS) &&
(isa<AllocaInst>(RHS) || isa<GlobalVariable>(RHS))) {
ConstantInt *LHSOffsetCI = dyn_cast<ConstantInt>(LHSOffset);
ConstantInt *RHSOffsetCI = dyn_cast<ConstantInt>(RHSOffset);
uint64_t LHSSize, RHSSize;
ObjectSizeOpts Opts;
Opts.NullIsUnknownSize =
NullPointerIsDefined(cast<AllocaInst>(LHS)->getFunction());
if (LHSOffsetCI && RHSOffsetCI &&
getObjectSize(LHS, LHSSize, DL, TLI, Opts) &&
getObjectSize(RHS, RHSSize, DL, TLI, Opts)) {
const APInt &LHSOffsetValue = LHSOffsetCI->getValue();
const APInt &RHSOffsetValue = RHSOffsetCI->getValue();
if (!LHSOffsetValue.isNegative() &&
!RHSOffsetValue.isNegative() &&
LHSOffsetValue.ult(LHSSize) &&
RHSOffsetValue.ult(RHSSize)) {
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
}
}
// Repeat the above check but this time without depending on DataLayout
// or being able to compute a precise size.
if (!cast<PointerType>(LHS->getType())->isEmptyTy() &&
!cast<PointerType>(RHS->getType())->isEmptyTy() &&
LHSOffset->isNullValue() &&
RHSOffset->isNullValue())
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
}
// Even if a non-inbounds GEP occurs along the path, we can still optimize
// equality comparisons concerning the result. We avoid walking the whole
// chain again by starting where the last calls to
// stripAndComputeConstantOffsets left off and accumulating the offsets.
Constant *LHSNoBound = stripAndComputeConstantOffsets(DL, LHS, true);
Constant *RHSNoBound = stripAndComputeConstantOffsets(DL, RHS, true);
if (LHS == RHS)
return ConstantExpr::getICmp(Pred,
ConstantExpr::getAdd(LHSOffset, LHSNoBound),
ConstantExpr::getAdd(RHSOffset, RHSNoBound));
// If one side of the equality comparison must come from a noalias call
// (meaning a system memory allocation function), and the other side must
// come from a pointer that cannot overlap with dynamically-allocated
// memory within the lifetime of the current function (allocas, byval
// arguments, globals), then determine the comparison result here.
SmallVector<const Value *, 8> LHSUObjs, RHSUObjs;
GetUnderlyingObjects(LHS, LHSUObjs, DL);
GetUnderlyingObjects(RHS, RHSUObjs, DL);
// Is the set of underlying objects all noalias calls?
auto IsNAC = [](ArrayRef<const Value *> Objects) {
return all_of(Objects, isNoAliasCall);
};
// Is the set of underlying objects all things which must be disjoint from
// noalias calls? For allocas, we consider only static ones (dynamic
// allocas might be transformed into calls to malloc not simultaneously
// live with the compared-to allocation). For globals, we exclude symbols
// that might be resolved lazily to symbols in another dynamically-loaded
// library (and, thus, could be malloc'ed by the implementation).
auto IsAllocDisjoint = [](ArrayRef<const Value *> Objects) {
return all_of(Objects, [](const Value *V) {
if (const AllocaInst *AI = dyn_cast<AllocaInst>(V))
return AI->getParent() && AI->getFunction() && AI->isStaticAlloca();
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() ||
GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) &&
!GV->isThreadLocal();
if (const Argument *A = dyn_cast<Argument>(V))
return A->hasByValAttr();
return false;
});
};
if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) ||
(IsNAC(RHSUObjs) && IsAllocDisjoint(LHSUObjs)))
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
// Fold comparisons for a non-escaping pointer even if the allocation call
// cannot be elided. We cannot fold a malloc comparison to null. Also, the
// dynamic allocation call could be either of the operands.
Value *MI = nullptr;
if (isAllocLikeFn(LHS, TLI) &&
llvm::isKnownNonZero(RHS, DL, 0, nullptr, CxtI, DT))
MI = LHS;
else if (isAllocLikeFn(RHS, TLI) &&
llvm::isKnownNonZero(LHS, DL, 0, nullptr, CxtI, DT))
MI = RHS;
// FIXME: We should also fold the compare when the pointer escapes, but the
// compare dominates the pointer escape.
if (MI && !PointerMayBeCaptured(MI, true, true))
return ConstantInt::get(GetCompareTy(LHS),
CmpInst::isFalseWhenEqual(Pred));
}
// Otherwise, fail.
return nullptr;
}
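// Illustrative sketch of the distinct-allocation fold above (assumed example):
//   %a = alloca i32
//   %b = alloca i32
//   %c = icmp eq i32* %a, %b     ; --> false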
/// Fold an icmp when its operands have i1 scalar type.
static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const SimplifyQuery &Q) {
Type *ITy = GetCompareTy(LHS); // The return type.
Type *OpTy = LHS->getType(); // The operand type.
if (!OpTy->isIntOrIntVectorTy(1))
return nullptr;
// A boolean compared to true/false can be simplified in 14 out of the 20
// (10 predicates * 2 constants) possible combinations. Cases not handled here
// require a 'not' of the LHS, so those must be transformed in InstCombine.
if (match(RHS, m_Zero())) {
switch (Pred) {
case CmpInst::ICMP_NE: // X != 0 -> X
case CmpInst::ICMP_UGT: // X >u 0 -> X
case CmpInst::ICMP_SLT: // X <s 0 -> X
return LHS;
case CmpInst::ICMP_ULT: // X <u 0 -> false
case CmpInst::ICMP_SGT: // X >s 0 -> false
return getFalse(ITy);
case CmpInst::ICMP_UGE: // X >=u 0 -> true
case CmpInst::ICMP_SLE: // X <=s 0 -> true
return getTrue(ITy);
default: break;
}
} else if (match(RHS, m_One())) {
switch (Pred) {
case CmpInst::ICMP_EQ: // X == 1 -> X
case CmpInst::ICMP_UGE: // X >=u 1 -> X
case CmpInst::ICMP_SLE: // X <=s -1 -> X
return LHS;
case CmpInst::ICMP_UGT: // X >u 1 -> false
case CmpInst::ICMP_SLT: // X <s -1 -> false
return getFalse(ITy);
case CmpInst::ICMP_ULE: // X <=u 1 -> true
case CmpInst::ICMP_SGE: // X >=s -1 -> true
return getTrue(ITy);
default: break;
}
}
switch (Pred) {
default:
break;
case ICmpInst::ICMP_UGE:
if (isImpliedCondition(RHS, LHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
case ICmpInst::ICMP_SGE:
/// For signed comparison, the values for an i1 are 0 and -1
/// respectively. This maps into a truth table of:
/// LHS | RHS | LHS >=s RHS | LHS implies RHS
/// 0 | 0 | 1 (0 >= 0) | 1
/// 0 | 1 | 1 (0 >= -1) | 1
/// 1 | 0 | 0 (-1 >= 0) | 0
/// 1 | 1 | 1 (-1 >= -1) | 1
if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
case ICmpInst::ICMP_ULE:
if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
}
return nullptr;
}
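// Illustrative sketches of the i1 compares above (assumed examples):
//   icmp ne i1 %b, false         ; --> %b
//   icmp ugt i1 %b, true         ; --> false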
/// Try hard to fold icmp with zero RHS because this is a common case.
static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const SimplifyQuery &Q) {
if (!match(RHS, m_Zero()))
return nullptr;
Type *ITy = GetCompareTy(LHS); // The return type.
switch (Pred) {
default:
llvm_unreachable("Unknown ICmp predicate!");
case ICmpInst::ICMP_ULT:
return getFalse(ITy);
case ICmpInst::ICMP_UGE:
return getTrue(ITy);
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_ULE:
if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT, Q.IIQ.UseInstrInfo))
return getFalse(ITy);
break;
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_UGT:
if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT, Q.IIQ.UseInstrInfo))
return getTrue(ITy);
break;
case ICmpInst::ICMP_SLT: {
KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (LHSKnown.isNegative())
return getTrue(ITy);
if (LHSKnown.isNonNegative())
return getFalse(ITy);
break;
}
case ICmpInst::ICMP_SLE: {
KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (LHSKnown.isNegative())
return getTrue(ITy);
if (LHSKnown.isNonNegative() &&
isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getFalse(ITy);
break;
}
case ICmpInst::ICMP_SGE: {
KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (LHSKnown.isNegative())
return getFalse(ITy);
if (LHSKnown.isNonNegative())
return getTrue(ITy);
break;
}
case ICmpInst::ICMP_SGT: {
KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (LHSKnown.isNegative())
return getFalse(ITy);
if (LHSKnown.isNonNegative() &&
isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
}
}
return nullptr;
}
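// Illustrative sketch of the zero-RHS folds above (assumed example):
//   %x = or i8 %v, 1             ; %x is known to be nonzero
//   %c = icmp ugt i8 %x, 0       ; --> true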
static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const InstrInfoQuery &IIQ) {
Type *ITy = GetCompareTy(RHS); // The return type.
Value *X;
// Sign-bit checks can be optimized to true/false after unsigned
// floating-point casts:
// icmp slt (bitcast (uitofp X)), 0 --> false
// icmp sgt (bitcast (uitofp X)), -1 --> true
if (match(LHS, m_BitCast(m_UIToFP(m_Value(X))))) {
if (Pred == ICmpInst::ICMP_SLT && match(RHS, m_Zero()))
return ConstantInt::getFalse(ITy);
if (Pred == ICmpInst::ICMP_SGT && match(RHS, m_AllOnes()))
return ConstantInt::getTrue(ITy);
}
const APInt *C;
if (!match(RHS, m_APInt(C)))
return nullptr;
// Rule out tautological comparisons (e.g., ult 0 or uge 0).
ConstantRange RHS_CR = ConstantRange::makeExactICmpRegion(Pred, *C);
if (RHS_CR.isEmptySet())
return ConstantInt::getFalse(ITy);
if (RHS_CR.isFullSet())
return ConstantInt::getTrue(ITy);
ConstantRange LHS_CR = computeConstantRange(LHS, IIQ.UseInstrInfo);
if (!LHS_CR.isFullSet()) {
if (RHS_CR.contains(LHS_CR))
return ConstantInt::getTrue(ITy);
if (RHS_CR.inverse().contains(LHS_CR))
return ConstantInt::getFalse(ITy);
}
return nullptr;
}
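// Illustrative sketches of the constant-RHS folds above (assumed examples):
//   icmp ugt i8 %x, 255          ; --> false (the region %x u> 255 is empty)
//
//   %f = uitofp i32 %n to float
//   %b = bitcast float %f to i32
//   icmp slt i32 %b, 0           ; --> false (the sign bit of %b is clear)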
/// TODO: A large part of this logic is duplicated in InstCombine's
/// foldICmpBinOp(). We should be able to share that and avoid the code
/// duplication.
static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
Type *ITy = GetCompareTy(LHS); // The return type.
BinaryOperator *LBO = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *RBO = dyn_cast<BinaryOperator>(RHS);
if (MaxRecurse && (LBO || RBO)) {
// Analyze the case when either LHS or RHS is an add instruction.
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
// LHS = A + B (or A and B are null); RHS = C + D (or C and D are null).
bool NoLHSWrapProblem = false, NoRHSWrapProblem = false;
if (LBO && LBO->getOpcode() == Instruction::Add) {
A = LBO->getOperand(0);
B = LBO->getOperand(1);
NoLHSWrapProblem =
ICmpInst::isEquality(Pred) ||
(CmpInst::isUnsigned(Pred) &&
Q.IIQ.hasNoUnsignedWrap(cast<OverflowingBinaryOperator>(LBO))) ||
(CmpInst::isSigned(Pred) &&
Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(LBO)));
}
if (RBO && RBO->getOpcode() == Instruction::Add) {
C = RBO->getOperand(0);
D = RBO->getOperand(1);
NoRHSWrapProblem =
ICmpInst::isEquality(Pred) ||
(CmpInst::isUnsigned(Pred) &&
Q.IIQ.hasNoUnsignedWrap(cast<OverflowingBinaryOperator>(RBO))) ||
(CmpInst::isSigned(Pred) &&
Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(RBO)));
}
// icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
if ((A == RHS || B == RHS) && NoLHSWrapProblem)
if (Value *V = SimplifyICmpInst(Pred, A == RHS ? B : A,
Constant::getNullValue(RHS->getType()), Q,
MaxRecurse - 1))
return V;
// icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
if ((C == LHS || D == LHS) && NoRHSWrapProblem)
if (Value *V =
SimplifyICmpInst(Pred, Constant::getNullValue(LHS->getType()),
C == LHS ? D : C, Q, MaxRecurse - 1))
return V;
// icmp (X+Y), (X+Z) -> icmp Y,Z for equalities or if there is no overflow.
if (A && C && (A == C || A == D || B == C || B == D) && NoLHSWrapProblem &&
NoRHSWrapProblem) {
// Determine Y and Z in the form icmp (X+Y), (X+Z).
Value *Y, *Z;
if (A == C) {
// C + B == C + D -> B == D
Y = B;
Z = D;
} else if (A == D) {
// D + B == C + D -> B == C
Y = B;
Z = C;
} else if (B == C) {
// A + C == C + D -> A == D
Y = A;
Z = D;
} else {
assert(B == D);
// A + D == C + D -> A == C
Y = A;
Z = C;
}
if (Value *V = SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse - 1))
return V;
}
}
{
Value *Y = nullptr;
// icmp pred (or X, Y), X
if (LBO && match(LBO, m_c_Or(m_Value(Y), m_Specific(RHS)))) {
if (Pred == ICmpInst::ICMP_ULT)
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_UGE)
return getTrue(ITy);
if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) {
KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (RHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy);
if (RHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SLT ? getFalse(ITy) : getTrue(ITy);
}
}
// icmp pred X, (or X, Y)
if (RBO && match(RBO, m_c_Or(m_Value(Y), m_Specific(LHS)))) {
if (Pred == ICmpInst::ICMP_ULE)
return getTrue(ITy);
if (Pred == ICmpInst::ICMP_UGT)
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) {
KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (LHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy);
if (LHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy);
}
}
}
// icmp pred (and X, Y), X
if (LBO && match(LBO, m_c_And(m_Value(), m_Specific(RHS)))) {
if (Pred == ICmpInst::ICMP_UGT)
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_ULE)
return getTrue(ITy);
}
// icmp pred X, (and X, Y)
if (RBO && match(RBO, m_c_And(m_Value(), m_Specific(LHS)))) {
if (Pred == ICmpInst::ICMP_UGE)
return getTrue(ITy);
if (Pred == ICmpInst::ICMP_ULT)
return getFalse(ITy);
}
// 0 - (zext X) pred C
if (!CmpInst::isUnsigned(Pred) && match(LHS, m_Neg(m_ZExt(m_Value())))) {
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
if (RHSC->getValue().isStrictlyPositive()) {
if (Pred == ICmpInst::ICMP_SLT)
return ConstantInt::getTrue(RHSC->getContext());
if (Pred == ICmpInst::ICMP_SGE)
return ConstantInt::getFalse(RHSC->getContext());
if (Pred == ICmpInst::ICMP_EQ)
return ConstantInt::getFalse(RHSC->getContext());
if (Pred == ICmpInst::ICMP_NE)
return ConstantInt::getTrue(RHSC->getContext());
}
if (RHSC->getValue().isNonNegative()) {
if (Pred == ICmpInst::ICMP_SLE)
return ConstantInt::getTrue(RHSC->getContext());
if (Pred == ICmpInst::ICMP_SGT)
return ConstantInt::getFalse(RHSC->getContext());
}
}
}
// icmp pred (urem X, Y), Y
if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) {
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE: {
KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
}
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getFalse(ITy);
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE: {
KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
}
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
return getTrue(ITy);
}
}
// icmp pred X, (urem Y, X)
if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE: {
KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
}
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getTrue(ITy);
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE: {
KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
}
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
return getFalse(ITy);
}
}
// x >> y <=u x
// x udiv y <=u x.
if (LBO && (match(LBO, m_LShr(m_Specific(RHS), m_Value())) ||
match(LBO, m_UDiv(m_Specific(RHS), m_Value())))) {
// icmp pred (X op Y), X
if (Pred == ICmpInst::ICMP_UGT)
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_ULE)
return getTrue(ITy);
}
// x >=u x >> y
// x >=u x udiv y.
if (RBO && (match(RBO, m_LShr(m_Specific(LHS), m_Value())) ||
match(RBO, m_UDiv(m_Specific(LHS), m_Value())))) {
// icmp pred X, (X op Y)
if (Pred == ICmpInst::ICMP_ULT)
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_UGE)
return getTrue(ITy);
}
// handle:
// CI2 << X == CI
// CI2 << X != CI
//
// where CI2 is a power of 2 and CI isn't
if (auto *CI = dyn_cast<ConstantInt>(RHS)) {
const APInt *CI2Val, *CIVal = &CI->getValue();
if (LBO && match(LBO, m_Shl(m_APInt(CI2Val), m_Value())) &&
CI2Val->isPowerOf2()) {
if (!CIVal->isPowerOf2()) {
// CI2 << X can equal zero in some circumstances, so
// this simplification is unsafe if CI is zero.
//
// We know it is safe if:
// - The shift is nsw, we can't shift out the one bit.
// - The shift is nuw, we can't shift out the one bit.
// - CI2 is one
// - CI isn't zero
if (Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(LBO)) ||
Q.IIQ.hasNoUnsignedWrap(cast<OverflowingBinaryOperator>(LBO)) ||
CI2Val->isOneValue() || !CI->isZero()) {
if (Pred == ICmpInst::ICMP_EQ)
return ConstantInt::getFalse(RHS->getContext());
if (Pred == ICmpInst::ICMP_NE)
return ConstantInt::getTrue(RHS->getContext());
}
}
if (CIVal->isSignMask() && CI2Val->isOneValue()) {
if (Pred == ICmpInst::ICMP_UGT)
return ConstantInt::getFalse(RHS->getContext());
if (Pred == ICmpInst::ICMP_ULE)
return ConstantInt::getTrue(RHS->getContext());
}
}
}
if (MaxRecurse && LBO && RBO && LBO->getOpcode() == RBO->getOpcode() &&
LBO->getOperand(1) == RBO->getOperand(1)) {
switch (LBO->getOpcode()) {
default:
break;
case Instruction::UDiv:
case Instruction::LShr:
if (ICmpInst::isSigned(Pred) || !Q.IIQ.isExact(LBO) ||
!Q.IIQ.isExact(RBO))
break;
if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
RBO->getOperand(0), Q, MaxRecurse - 1))
return V;
break;
case Instruction::SDiv:
if (!ICmpInst::isEquality(Pred) || !Q.IIQ.isExact(LBO) ||
!Q.IIQ.isExact(RBO))
break;
if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
RBO->getOperand(0), Q, MaxRecurse - 1))
return V;
break;
case Instruction::AShr:
if (!Q.IIQ.isExact(LBO) || !Q.IIQ.isExact(RBO))
break;
if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
RBO->getOperand(0), Q, MaxRecurse - 1))
return V;
break;
case Instruction::Shl: {
bool NUW = Q.IIQ.hasNoUnsignedWrap(LBO) && Q.IIQ.hasNoUnsignedWrap(RBO);
bool NSW = Q.IIQ.hasNoSignedWrap(LBO) && Q.IIQ.hasNoSignedWrap(RBO);
if (!NUW && !NSW)
break;
if (!NSW && ICmpInst::isSigned(Pred))
break;
if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
RBO->getOperand(0), Q, MaxRecurse - 1))
return V;
break;
}
}
}
return nullptr;
}
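// Illustrative sketches of the binop-vs-operand folds above (assumed
// examples):
//   %a = and i8 %x, %y
//   icmp ule i8 %a, %x           ; --> true
//
//   %d = udiv i8 %x, %y
//   icmp uge i8 %x, %d           ; --> true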
/// Simplify integer comparisons where at least one operand of the compare
/// matches an integer min/max idiom.
static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
Type *ITy = GetCompareTy(LHS); // The return type.
Value *A, *B;
CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE;
CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B".
// Signed variants on "max(a,b)>=a -> true".
if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) {
if (A != RHS)
std::swap(A, B); // smax(A, B) pred A.
EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B".
// We analyze this as smax(A, B) pred A.
P = Pred;
} else if (match(RHS, m_SMax(m_Value(A), m_Value(B))) &&
(A == LHS || B == LHS)) {
if (A != LHS)
std::swap(A, B); // A pred smax(A, B).
EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B".
// We analyze this as smax(A, B) swapped-pred A.
P = CmpInst::getSwappedPredicate(Pred);
} else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
(A == RHS || B == RHS)) {
if (A != RHS)
std::swap(A, B); // smin(A, B) pred A.
EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B".
// We analyze this as smax(-A, -B) swapped-pred -A.
// Note that we do not need to actually form -A or -B thanks to EqP.
P = CmpInst::getSwappedPredicate(Pred);
} else if (match(RHS, m_SMin(m_Value(A), m_Value(B))) &&
(A == LHS || B == LHS)) {
if (A != LHS)
std::swap(A, B); // A pred smin(A, B).
EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B".
// We analyze this as smax(-A, -B) pred -A.
// Note that we do not need to actually form -A or -B thanks to EqP.
P = Pred;
}
if (P != CmpInst::BAD_ICMP_PREDICATE) {
// Cases correspond to "max(A, B) p A".
switch (P) {
default:
break;
case CmpInst::ICMP_EQ:
case CmpInst::ICMP_SLE:
// Equivalent to "A EqP B". This may be the same as the condition tested
// in the max/min; if so, we can just return that.
if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B))
return V;
if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B))
return V;
// Otherwise, see if "A EqP B" simplifies.
if (MaxRecurse)
if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1))
return V;
break;
case CmpInst::ICMP_NE:
case CmpInst::ICMP_SGT: {
CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP);
// Equivalent to "A InvEqP B". This may be the same as the condition
// tested in the max/min; if so, we can just return that.
if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B))
return V;
if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B))
return V;
// Otherwise, see if "A InvEqP B" simplifies.
if (MaxRecurse)
if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1))
return V;
break;
}
case CmpInst::ICMP_SGE:
// Always true.
return getTrue(ITy);
case CmpInst::ICMP_SLT:
// Always false.
return getFalse(ITy);
}
}
// Unsigned variants on "max(a,b)>=a -> true".
P = CmpInst::BAD_ICMP_PREDICATE;
if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) {
if (A != RHS)
std::swap(A, B); // umax(A, B) pred A.
EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B".
// We analyze this as umax(A, B) pred A.
P = Pred;
} else if (match(RHS, m_UMax(m_Value(A), m_Value(B))) &&
(A == LHS || B == LHS)) {
if (A != LHS)
std::swap(A, B); // A pred umax(A, B).
EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B".
// We analyze this as umax(A, B) swapped-pred A.
P = CmpInst::getSwappedPredicate(Pred);
} else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
(A == RHS || B == RHS)) {
if (A != RHS)
std::swap(A, B); // umin(A, B) pred A.
EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B".
// We analyze this as umax(-A, -B) swapped-pred -A.
// Note that we do not need to actually form -A or -B thanks to EqP.
P = CmpInst::getSwappedPredicate(Pred);
} else if (match(RHS, m_UMin(m_Value(A), m_Value(B))) &&
(A == LHS || B == LHS)) {
if (A != LHS)
std::swap(A, B); // A pred umin(A, B).
EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B".
// We analyze this as umax(-A, -B) pred -A.
// Note that we do not need to actually form -A or -B thanks to EqP.
P = Pred;
}
if (P != CmpInst::BAD_ICMP_PREDICATE) {
// Cases correspond to "max(A, B) p A".
switch (P) {
default:
break;
case CmpInst::ICMP_EQ:
case CmpInst::ICMP_ULE:
// Equivalent to "A EqP B". This may be the same as the condition tested
// in the max/min; if so, we can just return that.
if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B))
return V;
if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B))
return V;
// Otherwise, see if "A EqP B" simplifies.
if (MaxRecurse)
if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1))
return V;
break;
case CmpInst::ICMP_NE:
case CmpInst::ICMP_UGT: {
CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP);
// Equivalent to "A InvEqP B". This may be the same as the condition
// tested in the max/min; if so, we can just return that.
if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B))
return V;
if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B))
return V;
// Otherwise, see if "A InvEqP B" simplifies.
if (MaxRecurse)
if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1))
return V;
break;
}
case CmpInst::ICMP_UGE:
// Always true.
return getTrue(ITy);
case CmpInst::ICMP_ULT:
// Always false.
return getFalse(ITy);
}
}
// Variants on "max(x,y) >= min(x,z)".
Value *C, *D;
if (match(LHS, m_SMax(m_Value(A), m_Value(B))) &&
match(RHS, m_SMin(m_Value(C), m_Value(D))) &&
(A == C || A == D || B == C || B == D)) {
// max(x, ?) pred min(x, ?).
if (Pred == CmpInst::ICMP_SGE)
// Always true.
return getTrue(ITy);
if (Pred == CmpInst::ICMP_SLT)
// Always false.
return getFalse(ITy);
} else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
match(RHS, m_SMax(m_Value(C), m_Value(D))) &&
(A == C || A == D || B == C || B == D)) {
// min(x, ?) pred max(x, ?).
if (Pred == CmpInst::ICMP_SLE)
// Always true.
return getTrue(ITy);
if (Pred == CmpInst::ICMP_SGT)
// Always false.
return getFalse(ITy);
} else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) &&
match(RHS, m_UMin(m_Value(C), m_Value(D))) &&
(A == C || A == D || B == C || B == D)) {
// max(x, ?) pred min(x, ?).
if (Pred == CmpInst::ICMP_UGE)
// Always true.
return getTrue(ITy);
if (Pred == CmpInst::ICMP_ULT)
// Always false.
return getFalse(ITy);
} else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
match(RHS, m_UMax(m_Value(C), m_Value(D))) &&
(A == C || A == D || B == C || B == D)) {
// min(x, ?) pred max(x, ?).
if (Pred == CmpInst::ICMP_ULE)
// Always true.
return getTrue(ITy);
if (Pred == CmpInst::ICMP_UGT)
// Always false.
return getFalse(ITy);
}
return nullptr;
}
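// Illustrative IR for the signed fold above (%a and %b are placeholder values;
// m_SMax here matches the canonical icmp+select form of a signed max):
//   %c   = icmp sgt i32 %a, %b
//   %max = select i1 %c, i32 %a, i32 %b      ; smax(%a, %b)
//   %r   = icmp sge i32 %max, %a             ; always true -> folds to i1 true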
/// Given operands for an ICmpInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
const SimplifyQuery &Q, unsigned MaxRecurse) {
CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate;
assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!");
if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
if (Constant *CRHS = dyn_cast<Constant>(RHS))
return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI);
// If we have a constant, make sure it is on the RHS.
std::swap(LHS, RHS);
Pred = CmpInst::getSwappedPredicate(Pred);
}
assert(!isa<UndefValue>(LHS) && "Unexpected icmp undef,%X");
Type *ITy = GetCompareTy(LHS); // The return type.
// For EQ and NE, we can always pick a value for the undef to make the
// predicate pass or fail, so we can return undef.
// Matches behavior in llvm::ConstantFoldCompareInstruction.
if (isa<UndefValue>(RHS) && ICmpInst::isEquality(Pred))
return UndefValue::get(ITy);
// icmp X, X -> true/false
// icmp X, undef -> true/false because undef could be X.
if (LHS == RHS || isa<UndefValue>(RHS))
return ConstantInt::get(ITy, CmpInst::isTrueWhenEqual(Pred));
if (Value *V = simplifyICmpOfBools(Pred, LHS, RHS, Q))
return V;
if (Value *V = simplifyICmpWithZero(Pred, LHS, RHS, Q))
return V;
if (Value *V = simplifyICmpWithConstant(Pred, LHS, RHS, Q.IIQ))
return V;
// If both operands have range metadata, use the metadata
// to simplify the comparison.
if (isa<Instruction>(RHS) && isa<Instruction>(LHS)) {
auto RHS_Instr = cast<Instruction>(RHS);
auto LHS_Instr = cast<Instruction>(LHS);
if (Q.IIQ.getMetadata(RHS_Instr, LLVMContext::MD_range) &&
Q.IIQ.getMetadata(LHS_Instr, LLVMContext::MD_range)) {
auto RHS_CR = getConstantRangeFromMetadata(
*RHS_Instr->getMetadata(LLVMContext::MD_range));
auto LHS_CR = getConstantRangeFromMetadata(
*LHS_Instr->getMetadata(LLVMContext::MD_range));
auto Satisfied_CR = ConstantRange::makeSatisfyingICmpRegion(Pred, RHS_CR);
if (Satisfied_CR.contains(LHS_CR))
return ConstantInt::getTrue(RHS->getContext());
auto InversedSatisfied_CR = ConstantRange::makeSatisfyingICmpRegion(
CmpInst::getInversePredicate(Pred), RHS_CR);
if (InversedSatisfied_CR.contains(LHS_CR))
return ConstantInt::getFalse(RHS->getContext());
}
}
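// For example, if the two operands are loads carrying !range !{i32 0, i32 10}
// and !range !{i32 20, i32 30} respectively, an "icmp ult" of them folds to
// true here, since every value in [0,10) is unsigned-less-than every value in
// [20,30).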
// Compare of cast, for example (zext X) != 0 -> X != 0
if (isa<CastInst>(LHS) && (isa<Constant>(RHS) || isa<CastInst>(RHS))) {
Instruction *LI = cast<CastInst>(LHS);
Value *SrcOp = LI->getOperand(0);
Type *SrcTy = SrcOp->getType();
Type *DstTy = LI->getType();
// Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input
// if the integer type is the same size as the pointer type.
if (MaxRecurse && isa<PtrToIntInst>(LI) &&
Q.DL.getTypeSizeInBits(SrcTy) == DstTy->getPrimitiveSizeInBits()) {
if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
// Transfer the cast to the constant.
if (Value *V = SimplifyICmpInst(Pred, SrcOp,
ConstantExpr::getIntToPtr(RHSC, SrcTy),
Q, MaxRecurse-1))
return V;
} else if (PtrToIntInst *RI = dyn_cast<PtrToIntInst>(RHS)) {
if (RI->getOperand(0)->getType() == SrcTy)
// Compare without the cast.
if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0),
Q, MaxRecurse-1))
return V;
}
}
if (isa<ZExtInst>(LHS)) {
// Turn icmp (zext X), (zext Y) into a compare of X and Y if they have the
// same type.
if (ZExtInst *RI = dyn_cast<ZExtInst>(RHS)) {
if (MaxRecurse && SrcTy == RI->getOperand(0)->getType())
// Compare X and Y. Note that signed predicates become unsigned.
if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred),
SrcOp, RI->getOperand(0), Q,
MaxRecurse-1))
return V;
}
// Turn icmp (zext X), Cst into a compare of X and Cst if Cst is extended
// too. If not, then try to deduce the result of the comparison.
else if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// Compute the constant that would happen if we truncated to SrcTy then
// reextended to DstTy.
Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy);
Constant *RExt = ConstantExpr::getCast(CastInst::ZExt, Trunc, DstTy);
// If the re-extended constant didn't change then this is effectively
// also a case of comparing two zero-extended values.
if (RExt == CI && MaxRecurse)
if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred),
SrcOp, Trunc, Q, MaxRecurse-1))
return V;
// Otherwise the upper bits of LHS are zero while RHS has a non-zero bit
// there. Use this to work out the result of the comparison.
if (RExt != CI) {
switch (Pred) {
default: llvm_unreachable("Unknown ICmp predicate!");
// LHS <u RHS.
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return ConstantInt::getFalse(CI->getContext());
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
return ConstantInt::getTrue(CI->getContext());
// LHS is non-negative. If RHS is negative then LHS >s RHS. If RHS
// is non-negative then LHS <s RHS.

case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
return CI->getValue().isNegative() ?
ConstantInt::getTrue(CI->getContext()) :
ConstantInt::getFalse(CI->getContext());
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
return CI->getValue().isNegative() ?
ConstantInt::getFalse(CI->getContext()) :
ConstantInt::getTrue(CI->getContext());
}
}
}
}
if (isa<SExtInst>(LHS)) {
// Turn icmp (sext X), (sext Y) into a compare of X and Y if they have the
// same type.
if (SExtInst *RI = dyn_cast<SExtInst>(RHS)) {
if (MaxRecurse && SrcTy == RI->getOperand(0)->getType())
// Compare X and Y. Note that the predicate does not change.
if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0),
Q, MaxRecurse-1))
return V;
}
// Turn icmp (sext X), Cst into a compare of X and Cst if Cst is extended
// too. If not, then try to deduce the result of the comparison.
else if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// Compute the constant that would happen if we truncated to SrcTy then
// reextended to DstTy.
Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy);
Constant *RExt = ConstantExpr::getCast(CastInst::SExt, Trunc, DstTy);
// If the re-extended constant didn't change then this is effectively
// also a case of comparing two sign-extended values.
if (RExt == CI && MaxRecurse)
if (Value *V = SimplifyICmpInst(Pred, SrcOp, Trunc, Q, MaxRecurse-1))
return V;
// Otherwise the upper bits of LHS are all equal, while RHS has varying
// bits there. Use this to work out the result of the comparison.
if (RExt != CI) {
switch (Pred) {
default: llvm_unreachable("Unknown ICmp predicate!");
case ICmpInst::ICMP_EQ:
return ConstantInt::getFalse(CI->getContext());
case ICmpInst::ICMP_NE:
return ConstantInt::getTrue(CI->getContext());
// If RHS is non-negative then LHS <s RHS. If RHS is negative then
// LHS >s RHS.
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
return CI->getValue().isNegative() ?
ConstantInt::getTrue(CI->getContext()) :
ConstantInt::getFalse(CI->getContext());
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
return CI->getValue().isNegative() ?
ConstantInt::getFalse(CI->getContext()) :
ConstantInt::getTrue(CI->getContext());
// If LHS is non-negative then LHS <u RHS. If LHS is negative then
// LHS >u RHS.
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
// Comparison is true iff the LHS <s 0.
if (MaxRecurse)
if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SLT, SrcOp,
Constant::getNullValue(SrcTy),
Q, MaxRecurse-1))
return V;
break;
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
// Comparison is true iff the LHS >=s 0.
if (MaxRecurse)
if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SGE, SrcOp,
Constant::getNullValue(SrcTy),
Q, MaxRecurse-1))
return V;
break;
}
}
}
}
}
// icmp eq|ne X, Y -> false|true if X != Y
if (ICmpInst::isEquality(Pred) &&
isKnownNonEqual(LHS, RHS, Q.DL, Q.AC, Q.CxtI, Q.DT, Q.IIQ.UseInstrInfo)) {
return Pred == ICmpInst::ICMP_NE ? getTrue(ITy) : getFalse(ITy);
}
if (Value *V = simplifyICmpWithBinOp(Pred, LHS, RHS, Q, MaxRecurse))
return V;
if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse))
return V;
// Simplify comparisons of related pointers using a powerful, recursive
// GEP-walk when we have target data available.
if (LHS->getType()->isPointerTy())
if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.AC, Q.CxtI,
Q.IIQ, LHS, RHS))
return C;
if (auto *CLHS = dyn_cast<PtrToIntOperator>(LHS))
if (auto *CRHS = dyn_cast<PtrToIntOperator>(RHS))
if (Q.DL.getTypeSizeInBits(CLHS->getPointerOperandType()) ==
Q.DL.getTypeSizeInBits(CLHS->getType()) &&
Q.DL.getTypeSizeInBits(CRHS->getPointerOperandType()) ==
Q.DL.getTypeSizeInBits(CRHS->getType()))
if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.AC, Q.CxtI,
Q.IIQ, CLHS->getPointerOperand(),
CRHS->getPointerOperand()))
return C;
if (GetElementPtrInst *GLHS = dyn_cast<GetElementPtrInst>(LHS)) {
if (GEPOperator *GRHS = dyn_cast<GEPOperator>(RHS)) {
if (GLHS->getPointerOperand() == GRHS->getPointerOperand() &&
GLHS->hasAllConstantIndices() && GRHS->hasAllConstantIndices() &&
(ICmpInst::isEquality(Pred) ||
(GLHS->isInBounds() && GRHS->isInBounds() &&
Pred == ICmpInst::getSignedPredicate(Pred)))) {
// The bases are equal and the indices are constant. Build a constant
// expression GEP with the same indices and a null base pointer to see
// what constant folding can make out of it.
Constant *Null = Constant::getNullValue(GLHS->getPointerOperandType());
SmallVector<Value *, 4> IndicesLHS(GLHS->idx_begin(), GLHS->idx_end());
Constant *NewLHS = ConstantExpr::getGetElementPtr(
GLHS->getSourceElementType(), Null, IndicesLHS);
SmallVector<Value *, 4> IndicesRHS(GRHS->idx_begin(), GRHS->idx_end());
Constant *NewRHS = ConstantExpr::getGetElementPtr(
GLHS->getSourceElementType(), Null, IndicesRHS);
return ConstantExpr::getICmp(Pred, NewLHS, NewRHS);
}
}
}
// If the comparison is with the result of a select instruction, check whether
// comparing with either branch of the select always yields the same value.
if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse))
return V;
// If the comparison is with the result of a phi instruction, check whether
// doing the compare with each incoming phi value yields a common result.
if (isa<PHINode>(LHS) || isa<PHINode>(RHS))
if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse))
return V;
return nullptr;
}
Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
const SimplifyQuery &Q) {
return ::SimplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit);
}
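// A typical caller (sketch; "SQ" stands for whatever SimplifyQuery the caller
// has built, "ICmp" for the compare being visited):
//   if (Value *V = SimplifyICmpInst(ICmp->getPredicate(), ICmp->getOperand(0),
//                                   ICmp->getOperand(1), SQ))
//     ICmp->replaceAllUsesWith(V);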
/// Given operands for an FCmpInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
FastMathFlags FMF, const SimplifyQuery &Q,
unsigned MaxRecurse) {
CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate;
assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!");
if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
if (Constant *CRHS = dyn_cast<Constant>(RHS))
return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI);
// If we have a constant, make sure it is on the RHS.
std::swap(LHS, RHS);
Pred = CmpInst::getSwappedPredicate(Pred);
}
// Fold trivial predicates.
Type *RetTy = GetCompareTy(LHS);
if (Pred == FCmpInst::FCMP_FALSE)
return getFalse(RetTy);
if (Pred == FCmpInst::FCMP_TRUE)
return getTrue(RetTy);
// Fold (un)ordered comparison if we can determine there are no NaNs.
if (Pred == FCmpInst::FCMP_UNO || Pred == FCmpInst::FCMP_ORD)
if (FMF.noNaNs() ||
(isKnownNeverNaN(LHS, Q.TLI) && isKnownNeverNaN(RHS, Q.TLI)))
return ConstantInt::get(RetTy, Pred == FCmpInst::FCMP_ORD);
// NaN is unordered; NaN is not ordered.
assert((FCmpInst::isOrdered(Pred) || FCmpInst::isUnordered(Pred)) &&
"Comparison must be either ordered or unordered");
if (match(RHS, m_NaN()))
return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred));
// fcmp pred x, undef and fcmp pred undef, x
// fold to true if unordered, false if ordered
if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) {
// Choosing NaN for the undef will always make unordered comparison succeed
// and ordered comparison fail.
return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred));
}
// fcmp x,x -> true/false. Not all compares are foldable.
if (LHS == RHS) {
if (CmpInst::isTrueWhenEqual(Pred))
return getTrue(RetTy);
if (CmpInst::isFalseWhenEqual(Pred))
return getFalse(RetTy);
}
// Handle fcmp with constant RHS.
// TODO: Use match with a specific FP value, so these work with vectors with
// undef lanes.
const APFloat *C;
if (match(RHS, m_APFloat(C))) {
// Check whether the constant is an infinity.
if (C->isInfinity()) {
if (C->isNegative()) {
switch (Pred) {
case FCmpInst::FCMP_OLT:
// No value is ordered and less than negative infinity.
return getFalse(RetTy);
case FCmpInst::FCMP_UGE:
// All values are unordered with or at least negative infinity.
return getTrue(RetTy);
default:
break;
}
} else {
switch (Pred) {
case FCmpInst::FCMP_OGT:
// No value is ordered and greater than infinity.
return getFalse(RetTy);
case FCmpInst::FCMP_ULE:
// All values are unordered with or at most infinity.
return getTrue(RetTy);
default:
break;
}
}
}
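// For example, an "fcmp olt" against negative infinity folds to false above,
// and an "fcmp ule" against positive infinity folds to true.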
if (C->isNegative() && !C->isNegZero()) {
assert(!C->isNaN() && "Unexpected NaN constant!");
// TODO: We can catch more cases by using a range check rather than
// relying on CannotBeOrderedLessThanZero.
switch (Pred) {
case FCmpInst::FCMP_UGE:
case FCmpInst::FCMP_UGT:
case FCmpInst::FCMP_UNE:
// (X >= 0) implies (X > C) when (C < 0)
if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
return getTrue(RetTy);
break;
case FCmpInst::FCMP_OEQ:
case FCmpInst::FCMP_OLE:
case FCmpInst::FCMP_OLT:
// (X >= 0) implies !(X < C) when (C < 0)
if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
return getFalse(RetTy);
break;
default:
break;
}
}
// Check comparison of [minnum/maxnum with constant] with other constant.
const APFloat *C2;
if ((match(LHS, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_APFloat(C2))) &&
C2->compare(*C) == APFloat::cmpLessThan) ||
(match(LHS, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_APFloat(C2))) &&
C2->compare(*C) == APFloat::cmpGreaterThan)) {
bool IsMaxNum =
cast<IntrinsicInst>(LHS)->getIntrinsicID() == Intrinsic::maxnum;
// The ordered relationship and minnum/maxnum guarantee that we do not
// have NaN constants, so ordered/unordered preds are handled the same.
switch (Pred) {
case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_UEQ:
// minnum(X, LesserC) == C --> false
// maxnum(X, GreaterC) == C --> false
return getFalse(RetTy);
case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_UNE:
// minnum(X, LesserC) != C --> true
// maxnum(X, GreaterC) != C --> true
return getTrue(RetTy);
case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_UGE:
case FCmpInst::FCMP_OGT: case FCmpInst::FCMP_UGT:
// minnum(X, LesserC) >= C --> false
// minnum(X, LesserC) > C --> false
// maxnum(X, GreaterC) >= C --> true
// maxnum(X, GreaterC) > C --> true
return ConstantInt::get(RetTy, IsMaxNum);
case FCmpInst::FCMP_OLE: case FCmpInst::FCMP_ULE:
case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_ULT:
// minnum(X, LesserC) <= C --> true
// minnum(X, LesserC) < C --> true
// maxnum(X, GreaterC) <= C --> false
// maxnum(X, GreaterC) < C --> false
return ConstantInt::get(RetTy, !IsMaxNum);
default:
// TRUE/FALSE/ORD/UNO should be handled before this.
llvm_unreachable("Unexpected fcmp predicate");
}
}
}
if (match(RHS, m_AnyZeroFP())) {
switch (Pred) {
case FCmpInst::FCMP_OGE:
case FCmpInst::FCMP_ULT:
// Positive or zero X >= 0.0 --> true
// Positive or zero X < 0.0 --> false
if ((FMF.noNaNs() || isKnownNeverNaN(LHS, Q.TLI)) &&
CannotBeOrderedLessThanZero(LHS, Q.TLI))
return Pred == FCmpInst::FCMP_OGE ? getTrue(RetTy) : getFalse(RetTy);
break;
case FCmpInst::FCMP_UGE:
case FCmpInst::FCMP_OLT:
// Positive or zero or nan X >= 0.0 --> true
// Positive or zero or nan X < 0.0 --> false
if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
return Pred == FCmpInst::FCMP_UGE ? getTrue(RetTy) : getFalse(RetTy);
break;
default:
break;
}
}
// If the comparison is with the result of a select instruction, check whether
// comparing with either branch of the select always yields the same value.
if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse))
return V;
// If the comparison is with the result of a phi instruction, check whether
// doing the compare with each incoming phi value yields a common result.
if (isa<PHINode>(LHS) || isa<PHINode>(RHS))
if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse))
return V;
return nullptr;
}
Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
FastMathFlags FMF, const SimplifyQuery &Q) {
return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit);
}
/// See if V simplifies when its operand Op is replaced with RepOp.
static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
const SimplifyQuery &Q,
unsigned MaxRecurse) {
// Trivial replacement.
if (V == Op)
return RepOp;
// We cannot replace a constant, and shouldn't even try.
if (isa<Constant>(Op))
return nullptr;
auto *I = dyn_cast<Instruction>(V);
if (!I)
return nullptr;
// If this is a binary operator, try to simplify it with the replaced op.
if (auto *B = dyn_cast<BinaryOperator>(I)) {
// Consider:
// %cmp = icmp eq i32 %x, 2147483647
// %add = add nsw i32 %x, 1
// %sel = select i1 %cmp, i32 -2147483648, i32 %add
//
// We can't replace %sel with %add unless we strip away the flags.
if (isa<OverflowingBinaryOperator>(B))
if (Q.IIQ.hasNoSignedWrap(B) || Q.IIQ.hasNoUnsignedWrap(B))
return nullptr;
if (isa<PossiblyExactOperator>(B) && Q.IIQ.isExact(B))
return nullptr;
if (MaxRecurse) {
if (B->getOperand(0) == Op)
return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), Q,
MaxRecurse - 1);
if (B->getOperand(1) == Op)
return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, Q,
MaxRecurse - 1);
}
}
// Same for CmpInsts.
if (CmpInst *C = dyn_cast<CmpInst>(I)) {
if (MaxRecurse) {
if (C->getOperand(0) == Op)
return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), Q,
MaxRecurse - 1);
if (C->getOperand(1) == Op)
return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, Q,
MaxRecurse - 1);
}
}
// Same for GEPs.
if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
if (MaxRecurse) {
SmallVector<Value *, 8> NewOps(GEP->getNumOperands());
transform(GEP->operands(), NewOps.begin(),
[&](Value *V) { return V == Op ? RepOp : V; });
return SimplifyGEPInst(GEP->getSourceElementType(), NewOps, Q,
MaxRecurse - 1);
}
}
// TODO: We could hand off more cases to instsimplify here.
// If all operands are constant after substituting Op for RepOp then we can
// constant fold the instruction.
if (Constant *CRepOp = dyn_cast<Constant>(RepOp)) {
// Build a list of all constant operands.
SmallVector<Constant *, 8> ConstOps;
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
if (I->getOperand(i) == Op)
ConstOps.push_back(CRepOp);
else if (Constant *COp = dyn_cast<Constant>(I->getOperand(i)))
ConstOps.push_back(COp);
else
break;
}
// All operands were constants, fold it.
if (ConstOps.size() == I->getNumOperands()) {
if (CmpInst *C = dyn_cast<CmpInst>(I))
return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0],
ConstOps[1], Q.DL, Q.TLI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
if (!LI->isVolatile())
return ConstantFoldLoadFromConstPtr(ConstOps[0], LI->getType(), Q.DL);
return ConstantFoldInstOperands(I, ConstOps, Q.DL, Q.TLI);
}
}
return nullptr;
}
/// Try to simplify a select instruction when its condition operand is an
/// integer comparison where one operand of the compare is a constant.
static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X,
const APInt *Y, bool TrueWhenUnset) {
const APInt *C;
// (X & Y) == 0 ? X & ~Y : X --> X
// (X & Y) != 0 ? X & ~Y : X --> X & ~Y
if (FalseVal == X && match(TrueVal, m_And(m_Specific(X), m_APInt(C))) &&
*Y == ~*C)
return TrueWhenUnset ? FalseVal : TrueVal;
// (X & Y) == 0 ? X : X & ~Y --> X & ~Y
// (X & Y) != 0 ? X : X & ~Y --> X
if (TrueVal == X && match(FalseVal, m_And(m_Specific(X), m_APInt(C))) &&
*Y == ~*C)
return TrueWhenUnset ? FalseVal : TrueVal;
if (Y->isPowerOf2()) {
// (X & Y) == 0 ? X | Y : X --> X | Y
// (X & Y) != 0 ? X | Y : X --> X
if (FalseVal == X && match(TrueVal, m_Or(m_Specific(X), m_APInt(C))) &&
*Y == *C)
return TrueWhenUnset ? TrueVal : FalseVal;
// (X & Y) == 0 ? X : X | Y --> X
// (X & Y) != 0 ? X : X | Y --> X | Y
if (TrueVal == X && match(FalseVal, m_Or(m_Specific(X), m_APInt(C))) &&
*Y == *C)
return TrueWhenUnset ? TrueVal : FalseVal;
}
return nullptr;
}
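// Instantiating the power-of-two case above (%x is a placeholder value):
//   %and = and i32 %x, 4
//   %cmp = icmp eq i32 %and, 0
//   %or  = or i32 %x, 4
//   %sel = select i1 %cmp, i32 %or, i32 %x   ; folds to %or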
/// An alternative way to test if a bit is set or not uses sgt/slt instead of
/// eq/ne.
static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS,
ICmpInst::Predicate Pred,
Value *TrueVal, Value *FalseVal) {
Value *X;
APInt Mask;
if (!decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, X, Mask))
return nullptr;
return simplifySelectBitTest(TrueVal, FalseVal, X, &Mask,
Pred == ICmpInst::ICMP_EQ);
}
/// Try to simplify a select instruction when its condition operand is an
/// integer comparison.
static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
Value *FalseVal, const SimplifyQuery &Q,
unsigned MaxRecurse) {
ICmpInst::Predicate Pred;
Value *CmpLHS, *CmpRHS;
if (!match(CondVal, m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS))))
return nullptr;
if (ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero())) {
Value *X;
const APInt *Y;
if (match(CmpLHS, m_And(m_Value(X), m_APInt(Y))))
if (Value *V = simplifySelectBitTest(TrueVal, FalseVal, X, Y,
Pred == ICmpInst::ICMP_EQ))
return V;
// Test for a bogus zero-shift-guard-op around funnel-shift or rotate.
Value *ShAmt;
auto isFsh = m_CombineOr(m_Intrinsic<Intrinsic::fshl>(m_Value(X), m_Value(),
m_Value(ShAmt)),
m_Intrinsic<Intrinsic::fshr>(m_Value(), m_Value(X),
m_Value(ShAmt)));
// (ShAmt == 0) ? fshl(X, *, ShAmt) : X --> X
// (ShAmt == 0) ? fshr(*, X, ShAmt) : X --> X
if (match(TrueVal, isFsh) && FalseVal == X && CmpLHS == ShAmt &&
Pred == ICmpInst::ICMP_EQ)
return X;
// (ShAmt != 0) ? X : fshl(X, *, ShAmt) --> X
// (ShAmt != 0) ? X : fshr(*, X, ShAmt) --> X
if (match(FalseVal, isFsh) && TrueVal == X && CmpLHS == ShAmt &&
Pred == ICmpInst::ICMP_NE)
return X;
// Test for a zero-shift-guard-op around rotates. These are used to
// avoid UB from oversized shifts in raw IR rotate patterns, but the
// intrinsics do not have that problem.
// We do not allow this transform for the general funnel shift case because
// that would not preserve the poison safety of the original code.
auto isRotate = m_CombineOr(m_Intrinsic<Intrinsic::fshl>(m_Value(X),
m_Deferred(X),
m_Value(ShAmt)),
m_Intrinsic<Intrinsic::fshr>(m_Value(X),
m_Deferred(X),
m_Value(ShAmt)));
// (ShAmt != 0) ? fshl(X, X, ShAmt) : X --> fshl(X, X, ShAmt)
// (ShAmt != 0) ? fshr(X, X, ShAmt) : X --> fshr(X, X, ShAmt)
if (match(TrueVal, isRotate) && FalseVal == X && CmpLHS == ShAmt &&
Pred == ICmpInst::ICMP_NE)
return TrueVal;
// (ShAmt == 0) ? X : fshl(X, X, ShAmt) --> fshl(X, X, ShAmt)
// (ShAmt == 0) ? X : fshr(X, X, ShAmt) --> fshr(X, X, ShAmt)
if (match(FalseVal, isRotate) && TrueVal == X && CmpLHS == ShAmt &&
Pred == ICmpInst::ICMP_EQ)
return FalseVal;
}
// Check for other compares that behave like a bit test.
if (Value *V = simplifySelectWithFakeICmpEq(CmpLHS, CmpRHS, Pred,
TrueVal, FalseVal))
return V;
// If we have an equality comparison, then we know the value in one of the
// arms of the select. See if substituting this value into the arm and
// simplifying the result yields the same value as the other arm.
if (Pred == ICmpInst::ICMP_EQ) {
if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) ==
TrueVal ||
SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) ==
TrueVal)
return FalseVal;
if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) ==
FalseVal ||
SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) ==
FalseVal)
return FalseVal;
} else if (Pred == ICmpInst::ICMP_NE) {
if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) ==
FalseVal ||
SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) ==
FalseVal)
return TrueVal;
if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) ==
TrueVal ||
SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) ==
TrueVal)
return TrueVal;
}
return nullptr;
}
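// Example of the equality-substitution fold above (%x and %y are placeholders):
//   %cmp = icmp eq i32 %x, 0
//   %sub = sub i32 %y, %x
//   %sel = select i1 %cmp, i32 %y, i32 %sub
// Replacing %x with 0 in %sub simplifies it to %y, which matches the true arm,
// so %sel folds to %sub.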
/// Try to simplify a select instruction when its condition operand is a
/// floating-point comparison.
static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F) {
FCmpInst::Predicate Pred;
if (!match(Cond, m_FCmp(Pred, m_Specific(T), m_Specific(F))) &&
!match(Cond, m_FCmp(Pred, m_Specific(F), m_Specific(T))))
return nullptr;
// TODO: The transform may not be valid with -0.0. An incomplete way of
// testing for that possibility is to check if at least one operand is a
// non-zero constant.
const APFloat *C;
if ((match(T, m_APFloat(C)) && C->isNonZero()) ||
(match(F, m_APFloat(C)) && C->isNonZero())) {
// (T == F) ? T : F --> F
// (F == T) ? T : F --> F
if (Pred == FCmpInst::FCMP_OEQ)
return F;
// (T != F) ? T : F --> T
// (F != T) ? T : F --> T
if (Pred == FCmpInst::FCMP_UNE)
return T;
}
return nullptr;
}
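// For example (with %x a placeholder value):
//   %cmp = fcmp oeq double %x, 1.0
//   %sel = select i1 %cmp, double %x, double 1.0   ; folds to 1.0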
/// Given operands for a SelectInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (auto *CondC = dyn_cast<Constant>(Cond)) {
if (auto *TrueC = dyn_cast<Constant>(TrueVal))
if (auto *FalseC = dyn_cast<Constant>(FalseVal))
return ConstantFoldSelectInstruction(CondC, TrueC, FalseC);
// select undef, X, Y -> X or Y
if (isa<UndefValue>(CondC))
return isa<Constant>(FalseVal) ? FalseVal : TrueVal;
// TODO: Vector constants with undef elements don't simplify.
// select true, X, Y -> X
if (CondC->isAllOnesValue())
return TrueVal;
// select false, X, Y -> Y
if (CondC->isNullValue())
return FalseVal;
}
// select ?, X, X -> X
if (TrueVal == FalseVal)
return TrueVal;
if (isa<UndefValue>(TrueVal)) // select ?, undef, X -> X
return FalseVal;
if (isa<UndefValue>(FalseVal)) // select ?, X, undef -> X
return TrueVal;
if (Value *V =
simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse))
return V;
if (Value *V = simplifySelectWithFCmp(Cond, TrueVal, FalseVal))
return V;
if (Value *V = foldSelectWithBinaryOp(Cond, TrueVal, FalseVal))
return V;
Optional<bool> Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL);
if (Imp)
return *Imp ? TrueVal : FalseVal;
return nullptr;
}
Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
const SimplifyQuery &Q) {
return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Q, RecursionLimit);
}
/// Given operands for a GetElementPtrInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
const SimplifyQuery &Q, unsigned) {
// The type of the GEP pointer operand.
unsigned AS =
cast<PointerType>(Ops[0]->getType()->getScalarType())->getAddressSpace();
// getelementptr P -> P.
if (Ops.size() == 1)
return Ops[0];
// Compute the (pointer) type returned by the GEP instruction.
Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Ops.slice(1));
Type *GEPTy = PointerType::get(LastType, AS);
if (VectorType *VT = dyn_cast<VectorType>(Ops[0]->getType()))
GEPTy = VectorType::get(GEPTy, VT->getNumElements());
else if (VectorType *VT = dyn_cast<VectorType>(Ops[1]->getType()))
GEPTy = VectorType::get(GEPTy, VT->getNumElements());
if (isa<UndefValue>(Ops[0]))
return UndefValue::get(GEPTy);
if (Ops.size() == 2) {
// getelementptr P, 0 -> P.
if (match(Ops[1], m_Zero()) && Ops[0]->getType() == GEPTy)
return Ops[0];
Type *Ty = SrcTy;
if (Ty->isSized()) {
Value *P;
uint64_t C;
uint64_t TyAllocSize = Q.DL.getTypeAllocSize(Ty);
// getelementptr P, N -> P if P points to a type of zero size.
if (TyAllocSize == 0 && Ops[0]->getType() == GEPTy)
return Ops[0];
// The following transforms are only safe if the ptrtoint cast
// doesn't truncate the pointers.
if (Ops[1]->getType()->getScalarSizeInBits() ==
Q.DL.getIndexSizeInBits(AS)) {
auto PtrToIntOrZero = [GEPTy](Value *P) -> Value * {
if (match(P, m_Zero()))
return Constant::getNullValue(GEPTy);
Value *Temp;
if (match(P, m_PtrToInt(m_Value(Temp))))
if (Temp->getType() == GEPTy)
return Temp;
return nullptr;
};
// getelementptr V, (sub P, V) -> P if P points to a type of size 1.
if (TyAllocSize == 1 &&
match(Ops[1], m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0])))))
if (Value *R = PtrToIntOrZero(P))
return R;
// getelementptr V, (ashr (sub P, V), C) -> Q
// if P points to a type of size 1 << C.
if (match(Ops[1],
m_AShr(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))),
m_ConstantInt(C))) &&
TyAllocSize == 1ULL << C)
if (Value *R = PtrToIntOrZero(P))
return R;
// getelementptr V, (sdiv (sub P, V), C) -> Q
// if P points to a type of size C.
if (match(Ops[1],
m_SDiv(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))),
m_SpecificInt(TyAllocSize))))
if (Value *R = PtrToIntOrZero(P))
return R;
}
}
}
if (Q.DL.getTypeAllocSize(LastType) == 1 &&
all_of(Ops.slice(1).drop_back(1),
[](Value *Idx) { return match(Idx, m_Zero()); })) {
unsigned IdxWidth =
Q.DL.getIndexSizeInBits(Ops[0]->getType()->getPointerAddressSpace());
if (Q.DL.getTypeSizeInBits(Ops.back()->getType()) == IdxWidth) {
APInt BasePtrOffset(IdxWidth, 0);
Value *StrippedBasePtr =
Ops[0]->stripAndAccumulateInBoundsConstantOffsets(Q.DL,
BasePtrOffset);
// gep (gep V, C), (sub 0, V) -> C
if (match(Ops.back(),
m_Sub(m_Zero(), m_PtrToInt(m_Specific(StrippedBasePtr))))) {
auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset);
return ConstantExpr::getIntToPtr(CI, GEPTy);
}
// gep (gep V, C), (xor V, -1) -> C-1
if (match(Ops.back(),
m_Xor(m_PtrToInt(m_Specific(StrippedBasePtr)), m_AllOnes()))) {
auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset - 1);
return ConstantExpr::getIntToPtr(CI, GEPTy);
}
}
}
// Check to see if this is constant foldable.
if (!all_of(Ops, [](Value *V) { return isa<Constant>(V); }))
return nullptr;
auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ops[0]),
Ops.slice(1));
if (auto *CEFolded = ConstantFoldConstant(CE, Q.DL))
return CEFolded;
return CE;
}
Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
const SimplifyQuery &Q) {
return ::SimplifyGEPInst(SrcTy, Ops, Q, RecursionLimit);
}
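// For example, "getelementptr i8, i8* %p, i64 0" folds to %p above, and a GEP
// whose base pointer is undef folds to undef of the GEP's result type.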
/// Given operands for an InsertValueInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyInsertValueInst(Value *Agg, Value *Val,
ArrayRef<unsigned> Idxs, const SimplifyQuery &Q,
unsigned) {
if (Constant *CAgg = dyn_cast<Constant>(Agg))
if (Constant *CVal = dyn_cast<Constant>(Val))
return ConstantFoldInsertValueInstruction(CAgg, CVal, Idxs);
// insertvalue x, undef, n -> x
if (match(Val, m_Undef()))
return Agg;
// insertvalue x, (extractvalue y, n), n
if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Val))
if (EV->getAggregateOperand()->getType() == Agg->getType() &&
EV->getIndices() == Idxs) {
// insertvalue undef, (extractvalue y, n), n -> y
if (match(Agg, m_Undef()))
return EV->getAggregateOperand();
// insertvalue y, (extractvalue y, n), n -> y
if (Agg == EV->getAggregateOperand())
return Agg;
}
return nullptr;
}
Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val,
ArrayRef<unsigned> Idxs,
const SimplifyQuery &Q) {
return ::SimplifyInsertValueInst(Agg, Val, Idxs, Q, RecursionLimit);
}
Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx,
const SimplifyQuery &Q) {
// Try to constant fold.
auto *VecC = dyn_cast<Constant>(Vec);
auto *ValC = dyn_cast<Constant>(Val);
auto *IdxC = dyn_cast<Constant>(Idx);
if (VecC && ValC && IdxC)
return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC);
// Fold into undef if index is out of bounds.
if (auto *CI = dyn_cast<ConstantInt>(Idx)) {
uint64_t NumElements = cast<VectorType>(Vec->getType())->getNumElements();
if (CI->uge(NumElements))
return UndefValue::get(Vec->getType());
}
// If index is undef, it might be out of bounds (see above case)
if (isa<UndefValue>(Idx))
return UndefValue::get(Vec->getType());
// Inserting an undef scalar? Assume it is the same value as the existing
// vector element.
if (isa<UndefValue>(Val))
return Vec;
// If we are extracting a value from a vector, then inserting it into the same
// place, that's the input vector:
// insertelt Vec, (extractelt Vec, Idx), Idx --> Vec
if (match(Val, m_ExtractElement(m_Specific(Vec), m_Specific(Idx))))
return Vec;
return nullptr;
}
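// For example, "insertelement <4 x i32> %v, i32 %x, i32 7" folds to undef above
// (the constant index is out of bounds), and inserting an undef scalar simply
// returns %v.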
/// Given operands for an ExtractValueInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
const SimplifyQuery &, unsigned) {
if (auto *CAgg = dyn_cast<Constant>(Agg))
return ConstantFoldExtractValueInstruction(CAgg, Idxs);
// extractvalue (insertvalue y, elt, n), n -> elt
unsigned NumIdxs = Idxs.size();
for (auto *IVI = dyn_cast<InsertValueInst>(Agg); IVI != nullptr;
IVI = dyn_cast<InsertValueInst>(IVI->getAggregateOperand())) {
ArrayRef<unsigned> InsertValueIdxs = IVI->getIndices();
unsigned NumInsertValueIdxs = InsertValueIdxs.size();
unsigned NumCommonIdxs = std::min(NumInsertValueIdxs, NumIdxs);
if (InsertValueIdxs.slice(0, NumCommonIdxs) ==
Idxs.slice(0, NumCommonIdxs)) {
if (NumIdxs == NumInsertValueIdxs)
return IVI->getInsertedValueOperand();
break;
}
}
return nullptr;
}
Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
const SimplifyQuery &Q) {
return ::SimplifyExtractValueInst(Agg, Idxs, Q, RecursionLimit);
}
/// Given operands for an ExtractElementInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &,
unsigned) {
if (auto *CVec = dyn_cast<Constant>(Vec)) {
if (auto *CIdx = dyn_cast<Constant>(Idx))
return ConstantFoldExtractElementInstruction(CVec, CIdx);
// The index is not relevant if our vector is a splat.
if (auto *Splat = CVec->getSplatValue())
return Splat;
if (isa<UndefValue>(Vec))
return UndefValue::get(Vec->getType()->getVectorElementType());
}
// If extracting a specified index from the vector, see if we can recursively
// find a previously computed scalar that was inserted into the vector.
if (auto *IdxC = dyn_cast<ConstantInt>(Idx)) {
if (IdxC->getValue().uge(Vec->getType()->getVectorNumElements()))
// definitely out of bounds, thus undefined result
return UndefValue::get(Vec->getType()->getVectorElementType());
if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue()))
return Elt;
}
// An undef extract index can be arbitrarily chosen to be an out-of-range
// index value, which would result in the instruction being undef.
if (isa<UndefValue>(Idx))
return UndefValue::get(Vec->getType()->getVectorElementType());
return nullptr;
}
Value *llvm::SimplifyExtractElementInst(Value *Vec, Value *Idx,
const SimplifyQuery &Q) {
return ::SimplifyExtractElementInst(Vec, Idx, Q, RecursionLimit);
}
/// See if we can fold the given phi. If not, returns null.
static Value *SimplifyPHINode(PHINode *PN, const SimplifyQuery &Q) {
// If all of the PHI's incoming values are the same then replace the PHI node
// with the common value.
Value *CommonValue = nullptr;
bool HasUndefInput = false;
for (Value *Incoming : PN->incoming_values()) {
// If the incoming value is the phi node itself, it can safely be skipped.
if (Incoming == PN) continue;
if (isa<UndefValue>(Incoming)) {
// Remember that we saw an undef value, but otherwise ignore them.
HasUndefInput = true;
continue;
}
if (CommonValue && Incoming != CommonValue)
return nullptr; // Not the same, bail out.
CommonValue = Incoming;
}
// If CommonValue is null then all of the incoming values were either undef or
// equal to the phi node itself.
if (!CommonValue)
return UndefValue::get(PN->getType());
// If we have a PHI node like phi(X, undef, X), where X is defined by some
// instruction, we cannot return X as the result of the PHI node unless it
// dominates the PHI block.
if (HasUndefInput)
return valueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : nullptr;
return CommonValue;
}
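// For example, "phi i32 [ %a, %bb1 ], [ %a, %bb2 ], [ undef, %bb3 ]" folds to
// %a, provided %a dominates the block containing the phi.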
static Value *SimplifyCastInst(unsigned CastOpc, Value *Op,
Type *Ty, const SimplifyQuery &Q, unsigned MaxRecurse) {
if (auto *C = dyn_cast<Constant>(Op))
return ConstantFoldCastOperand(CastOpc, C, Ty, Q.DL);
if (auto *CI = dyn_cast<CastInst>(Op)) {
auto *Src = CI->getOperand(0);
Type *SrcTy = Src->getType();
Type *MidTy = CI->getType();
Type *DstTy = Ty;
if (Src->getType() == Ty) {
auto FirstOp = static_cast<Instruction::CastOps>(CI->getOpcode());
auto SecondOp = static_cast<Instruction::CastOps>(CastOpc);
Type *SrcIntPtrTy =
SrcTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(SrcTy) : nullptr;
Type *MidIntPtrTy =
MidTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(MidTy) : nullptr;
Type *DstIntPtrTy =
DstTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(DstTy) : nullptr;
if (CastInst::isEliminableCastPair(FirstOp, SecondOp, SrcTy, MidTy, DstTy,
SrcIntPtrTy, MidIntPtrTy,
DstIntPtrTy) == Instruction::BitCast)
return Src;
}
}
// bitcast x -> x
if (CastOpc == Instruction::BitCast)
if (Op->getType() == Ty)
return Op;
return nullptr;
}
Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
const SimplifyQuery &Q) {
return ::SimplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit);
}
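// For example, a bitcast whose destination type equals its operand's type folds
// to the operand, and a pair of casts that isEliminableCastPair reduces to a
// no-op bitcast folds back to the original source value.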
/// For the given destination element of a shuffle, peek through shuffles to
/// match a root vector source operand that contains that element in the same
/// vector lane (ie, the same mask index), so we can eliminate the shuffle(s).
static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
int MaskVal, Value *RootVec,
unsigned MaxRecurse) {
if (!MaxRecurse--)
return nullptr;
// Bail out if any mask value is undefined. That kind of shuffle may be
// simplified further based on demanded bits or other folds.
if (MaskVal == -1)
return nullptr;
// The mask value chooses which source operand we need to look at next.
int InVecNumElts = Op0->getType()->getVectorNumElements();
int RootElt = MaskVal;
Value *SourceOp = Op0;
if (MaskVal >= InVecNumElts) {
RootElt = MaskVal - InVecNumElts;
SourceOp = Op1;
}
// If the source operand is a shuffle itself, look through it to find the
// matching root vector.
if (auto *SourceShuf = dyn_cast<ShuffleVectorInst>(SourceOp)) {
return foldIdentityShuffles(
DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1),
SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse);
}
// TODO: Look through bitcasts? What if the bitcast changes the vector element
// size?
// The source operand is not a shuffle. Initialize the root vector value for
// this shuffle if that has not been done yet.
if (!RootVec)
RootVec = SourceOp;
// Give up as soon as a source operand does not match the existing root value.
if (RootVec != SourceOp)
return nullptr;
// The element must be coming from the same lane in the source vector
// (although it may have crossed lanes in intermediate shuffles).
if (RootElt != DestElt)
return nullptr;
return RootVec;
}
static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
Type *RetTy, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (isa<UndefValue>(Mask))
return UndefValue::get(RetTy);
Type *InVecTy = Op0->getType();
unsigned MaskNumElts = Mask->getType()->getVectorNumElements();
unsigned InVecNumElts = InVecTy->getVectorNumElements();
SmallVector<int, 32> Indices;
ShuffleVectorInst::getShuffleMask(Mask, Indices);
assert(MaskNumElts == Indices.size() &&
"Size of Indices not same as number of mask elements?");
// Canonicalization: If mask does not select elements from an input vector,
// replace that input vector with undef.
bool MaskSelects0 = false, MaskSelects1 = false;
for (unsigned i = 0; i != MaskNumElts; ++i) {
if (Indices[i] == -1)
continue;
if ((unsigned)Indices[i] < InVecNumElts)
MaskSelects0 = true;
else
MaskSelects1 = true;
}
if (!MaskSelects0)
Op0 = UndefValue::get(InVecTy);
if (!MaskSelects1)
Op1 = UndefValue::get(InVecTy);
auto *Op0Const = dyn_cast<Constant>(Op0);
auto *Op1Const = dyn_cast<Constant>(Op1);
// If all operands are constant, constant fold the shuffle.
if (Op0Const && Op1Const)
return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask);
// Canonicalization: if only one input vector is constant, it shall be the
// second one.
if (Op0Const && !Op1Const) {
std::swap(Op0, Op1);
ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts);
}
// A shuffle of a splat is always the splat itself. Legal if the shuffle's
// value type is the same as the input vectors' type.
if (auto *OpShuf = dyn_cast<ShuffleVectorInst>(Op0))
if (isa<UndefValue>(Op1) && RetTy == InVecTy &&
OpShuf->getMask()->getSplatValue())
return Op0;
// Don't fold a shuffle with undef mask elements. This may get folded in a
// better way using demanded bits or other analysis.
// TODO: Should we allow this?
if (find(Indices, -1) != Indices.end())
return nullptr;
// Check if every element of this shuffle can be mapped back to the
// corresponding element of a single root vector. If so, we don't need this
// shuffle. This handles simple identity shuffles as well as chains of
// shuffles that may widen/narrow and/or move elements across lanes and back.
Value *RootVec = nullptr;
for (unsigned i = 0; i != MaskNumElts; ++i) {
// Note that recursion is limited for each vector element, so if any element
// exceeds the limit, this will fail to simplify.
RootVec =
foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse);
// We can't replace a widening/narrowing shuffle with one of its operands.
if (!RootVec || RootVec->getType() != RetTy)
return nullptr;
}
return RootVec;
}
/// Given operands for a ShuffleVectorInst, fold the result or return null.
Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
Type *RetTy, const SimplifyQuery &Q) {
return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit);
}
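// For example, "shufflevector <4 x i32> %v, <4 x i32> undef,
//               <4 x i32> <i32 0, i32 1, i32 2, i32 3>"
// is an identity shuffle and folds to %v.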
static Constant *foldConstant(Instruction::UnaryOps Opcode,
Value *&Op, const SimplifyQuery &Q) {
if (auto *C = dyn_cast<Constant>(Op))
return ConstantFoldUnaryOpOperand(Opcode, C, Q.DL);
return nullptr;
}
/// Given the operand for an FNeg, see if we can fold the result. If not, this
/// returns null.
static Value *simplifyFNegInst(Value *Op, FastMathFlags FMF,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldConstant(Instruction::FNeg, Op, Q))
return C;
Value *X;
// fneg (fneg X) ==> X
if (match(Op, m_FNeg(m_Value(X))))
return X;
return nullptr;
}
Value *llvm::SimplifyFNegInst(Value *Op, FastMathFlags FMF,
const SimplifyQuery &Q) {
return ::simplifyFNegInst(Op, FMF, Q, RecursionLimit);
}
static Constant *propagateNaN(Constant *In) {
// If the input is a vector with undef elements, just return a default NaN.
if (!In->isNaN())
return ConstantFP::getNaN(In->getType());
// Propagate the existing NaN constant when possible.
// TODO: Should we quiet a signaling NaN?
return In;
}
static Constant *simplifyFPBinop(Value *Op0, Value *Op1) {
if (isa<UndefValue>(Op0) || isa<UndefValue>(Op1))
return ConstantFP::getNaN(Op0->getType());
if (match(Op0, m_NaN()))
return propagateNaN(cast<Constant>(Op0));
if (match(Op1, m_NaN()))
return propagateNaN(cast<Constant>(Op1));
return nullptr;
}
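// For example, "fadd double %x, undef" and "fadd double %x, 0x7FF8000000000000"
// (a NaN) both fold to a NaN constant via this helper.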
/// Given operands for an FAdd, see if we can fold the result. If not, this
/// returns null.
static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q))
return C;
if (Constant *C = simplifyFPBinop(Op0, Op1))
return C;
// fadd X, -0 ==> X
if (match(Op1, m_NegZeroFP()))
return Op0;
// fadd X, 0 ==> X, when we know X is not -0
if (match(Op1, m_PosZeroFP()) &&
(FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI)))
return Op0;
// With nnan: -X + X --> 0.0 (and commuted variant)
// We don't have to explicitly exclude infinities (ninf): INF + -INF == NaN.
// Negative zeros are allowed because we always end up with positive zero:
// X = -0.0: (-0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0
// X = -0.0: ( 0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0
// X = 0.0: (-0.0 - ( 0.0)) + ( 0.0) == (-0.0) + ( 0.0) == 0.0
// X = 0.0: ( 0.0 - ( 0.0)) + ( 0.0) == ( 0.0) + ( 0.0) == 0.0
if (FMF.noNaNs()) {
if (match(Op0, m_FSub(m_AnyZeroFP(), m_Specific(Op1))) ||
match(Op1, m_FSub(m_AnyZeroFP(), m_Specific(Op0))))
return ConstantFP::getNullValue(Op0->getType());
if (match(Op0, m_FNeg(m_Specific(Op1))) ||
match(Op1, m_FNeg(m_Specific(Op0))))
return ConstantFP::getNullValue(Op0->getType());
}
// (X - Y) + Y --> X
// Y + (X - Y) --> X
Value *X;
if (FMF.noSignedZeros() && FMF.allowReassoc() &&
(match(Op0, m_FSub(m_Value(X), m_Specific(Op1))) ||
match(Op1, m_FSub(m_Value(X), m_Specific(Op0)))))
return X;
return nullptr;
}
/// Given operands for an FSub, see if we can fold the result. If not, this
/// returns null.
static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q))
return C;
if (Constant *C = simplifyFPBinop(Op0, Op1))
return C;
// fsub X, +0 ==> X
if (match(Op1, m_PosZeroFP()))
return Op0;
// fsub X, -0 ==> X, when we know X is not -0
if (match(Op1, m_NegZeroFP()) &&
(FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI)))
return Op0;
// fsub -0.0, (fsub -0.0, X) ==> X
// fsub -0.0, (fneg X) ==> X
Value *X;
if (match(Op0, m_NegZeroFP()) &&
match(Op1, m_FNeg(m_Value(X))))
return X;
// fsub 0.0, (fsub 0.0, X) ==> X if signed zeros are ignored.
// fsub 0.0, (fneg X) ==> X if signed zeros are ignored.
if (FMF.noSignedZeros() && match(Op0, m_AnyZeroFP()) &&
(match(Op1, m_FSub(m_AnyZeroFP(), m_Value(X))) ||
match(Op1, m_FNeg(m_Value(X)))))
return X;
// fsub nnan x, x ==> 0.0
if (FMF.noNaNs() && Op0 == Op1)
return Constant::getNullValue(Op0->getType());
// Y - (Y - X) --> X
// (X + Y) - Y --> X
if (FMF.noSignedZeros() && FMF.allowReassoc() &&
(match(Op1, m_FSub(m_Specific(Op0), m_Value(X))) ||
match(Op0, m_c_FAdd(m_Specific(Op1), m_Value(X)))))
return X;
return nullptr;
}
/// Given the operands for an FMul, see if we can fold the result
static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q))
return C;
if (Constant *C = simplifyFPBinop(Op0, Op1))
return C;
// fmul X, 1.0 ==> X
if (match(Op1, m_FPOne()))
return Op0;
// fmul nnan nsz X, 0 ==> 0
if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZeroFP()))
return ConstantFP::getNullValue(Op0->getType());
// sqrt(X) * sqrt(X) --> X, if we can:
// 1. Remove the intermediate rounding (reassociate).
// 2. Ignore non-zero negative numbers because sqrt would produce NAN.
// 3. Ignore -0.0 because sqrt(-0.0) == -0.0, but -0.0 * -0.0 == 0.0.
Value *X;
if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::sqrt>(m_Value(X))) &&
FMF.allowReassoc() && FMF.noNaNs() && FMF.noSignedZeros())
return X;
return nullptr;
}
Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q) {
return ::SimplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit);
}
Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q) {
return ::SimplifyFSubInst(Op0, Op1, FMF, Q, RecursionLimit);
}
Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q) {
return ::SimplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit);
}
static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q, unsigned) {
if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q))
return C;
if (Constant *C = simplifyFPBinop(Op0, Op1))
return C;
// X / 1.0 -> X
if (match(Op1, m_FPOne()))
return Op0;
// 0 / X -> 0
// Requires that NaNs are off (X could be zero) and signed zeroes are
// ignored (X could be positive or negative, so the output sign is unknown).
if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZeroFP()))
return ConstantFP::getNullValue(Op0->getType());
if (FMF.noNaNs()) {
// X / X -> 1.0 is legal when NaNs are ignored.
// We can ignore infinities because INF/INF is NaN.
if (Op0 == Op1)
return ConstantFP::get(Op0->getType(), 1.0);
// (X * Y) / Y --> X if we can reassociate to the above form.
Value *X;
if (FMF.allowReassoc() && match(Op0, m_c_FMul(m_Value(X), m_Specific(Op1))))
return X;
// -X / X -> -1.0 and
// X / -X -> -1.0 are legal when NaNs are ignored.
// We can ignore signed zeros because +-0.0/+-0.0 is NaN and ignored.
if (match(Op0, m_FNegNSZ(m_Specific(Op1))) ||
match(Op1, m_FNegNSZ(m_Specific(Op0))))
return ConstantFP::get(Op0->getType(), -1.0);
}
return nullptr;
}
Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q) {
return ::SimplifyFDivInst(Op0, Op1, FMF, Q, RecursionLimit);
}
static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q, unsigned) {
if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q))
return C;
if (Constant *C = simplifyFPBinop(Op0, Op1))
return C;
// Unlike fdiv, the result of frem always matches the sign of the dividend.
// The constant match may include undef elements in a vector, so return a full
// zero constant as the result.
if (FMF.noNaNs()) {
// +0 % X -> 0
if (match(Op0, m_PosZeroFP()))
return ConstantFP::getNullValue(Op0->getType());
// -0 % X -> -0
if (match(Op0, m_NegZeroFP()))
return ConstantFP::getNegativeZero(Op0->getType());
}
return nullptr;
}
Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q) {
return ::SimplifyFRemInst(Op0, Op1, FMF, Q, RecursionLimit);
}
//=== Helper functions for higher up the class hierarchy.
/// Given the operand for a UnaryOperator, see if we can fold the result.
/// If not, this returns null.
static Value *simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q,
unsigned MaxRecurse) {
switch (Opcode) {
case Instruction::FNeg:
return simplifyFNegInst(Op, FastMathFlags(), Q, MaxRecurse);
default:
llvm_unreachable("Unexpected opcode");
}
}
/// Given the operand for a UnaryOperator, see if we can fold the result.
/// If not, this returns null.
/// In contrast to SimplifyUnOp, try to use FastMathFlags when folding the
/// result. In case we don't need FastMathFlags, simply fall back to SimplifyUnOp.
static Value *simplifyFPUnOp(unsigned Opcode, Value *Op,
const FastMathFlags &FMF,
const SimplifyQuery &Q, unsigned MaxRecurse) {
switch (Opcode) {
case Instruction::FNeg:
return simplifyFNegInst(Op, FMF, Q, MaxRecurse);
default:
return simplifyUnOp(Opcode, Op, Q, MaxRecurse);
}
}
Value *llvm::SimplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q) {
return ::simplifyUnOp(Opcode, Op, Q, RecursionLimit);
}
Value *llvm::SimplifyFPUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF,
const SimplifyQuery &Q) {
return ::simplifyFPUnOp(Opcode, Op, FMF, Q, RecursionLimit);
}
/// Given operands for a BinaryOperator, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
const SimplifyQuery &Q, unsigned MaxRecurse) {
switch (Opcode) {
case Instruction::Add:
return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::Sub:
return SimplifySubInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::Mul:
return SimplifyMulInst(LHS, RHS, Q, MaxRecurse);
case Instruction::SDiv:
return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
case Instruction::UDiv:
return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
case Instruction::SRem:
return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
case Instruction::URem:
return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
case Instruction::Shl:
return SimplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::LShr:
return SimplifyLShrInst(LHS, RHS, false, Q, MaxRecurse);
case Instruction::AShr:
return SimplifyAShrInst(LHS, RHS, false, Q, MaxRecurse);
case Instruction::And:
return SimplifyAndInst(LHS, RHS, Q, MaxRecurse);
case Instruction::Or:
return SimplifyOrInst(LHS, RHS, Q, MaxRecurse);
case Instruction::Xor:
return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
case Instruction::FAdd:
return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
case Instruction::FSub:
return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
case Instruction::FMul:
return SimplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
case Instruction::FDiv:
return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
case Instruction::FRem:
return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
default:
llvm_unreachable("Unexpected opcode");
}
}
/// Given operands for a BinaryOperator, see if we can fold the result.
/// If not, this returns null.
/// In contrast to SimplifyBinOp, try to use FastMathFlags when folding the
/// result. In case we don't need FastMathFlags, simply fall back to SimplifyBinOp.
static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
const FastMathFlags &FMF, const SimplifyQuery &Q,
unsigned MaxRecurse) {
switch (Opcode) {
case Instruction::FAdd:
return SimplifyFAddInst(LHS, RHS, FMF, Q, MaxRecurse);
case Instruction::FSub:
return SimplifyFSubInst(LHS, RHS, FMF, Q, MaxRecurse);
case Instruction::FMul:
return SimplifyFMulInst(LHS, RHS, FMF, Q, MaxRecurse);
case Instruction::FDiv:
return SimplifyFDivInst(LHS, RHS, FMF, Q, MaxRecurse);
default:
return SimplifyBinOp(Opcode, LHS, RHS, Q, MaxRecurse);
}
}
Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
const SimplifyQuery &Q) {
return ::SimplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit);
}
Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
FastMathFlags FMF, const SimplifyQuery &Q) {
return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit);
}
/// Given operands for a CmpInst, see if we can fold the result.
static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
const SimplifyQuery &Q, unsigned MaxRecurse) {
if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate))
return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse);
return SimplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse);
}
Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
const SimplifyQuery &Q) {
return ::SimplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit);
}
static bool IsIdempotent(Intrinsic::ID ID) {
switch (ID) {
default: return false;
// Unary idempotent: f(f(x)) = f(x)
case Intrinsic::fabs:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::round:
case Intrinsic::canonicalize:
return true;
}
}
static Value *SimplifyRelativeLoad(Constant *Ptr, Constant *Offset,
const DataLayout &DL) {
GlobalValue *PtrSym;
APInt PtrOffset;
if (!IsConstantOffsetFromGlobal(Ptr, PtrSym, PtrOffset, DL))
return nullptr;
Type *Int8PtrTy = Type::getInt8PtrTy(Ptr->getContext());
Type *Int32Ty = Type::getInt32Ty(Ptr->getContext());
Type *Int32PtrTy = Int32Ty->getPointerTo();
Type *Int64Ty = Type::getInt64Ty(Ptr->getContext());
auto *OffsetConstInt = dyn_cast<ConstantInt>(Offset);
if (!OffsetConstInt || OffsetConstInt->getType()->getBitWidth() > 64)
return nullptr;
uint64_t OffsetInt = OffsetConstInt->getSExtValue();
if (OffsetInt % 4 != 0)
return nullptr;
Constant *C = ConstantExpr::getGetElementPtr(
Int32Ty, ConstantExpr::getBitCast(Ptr, Int32PtrTy),
ConstantInt::get(Int64Ty, OffsetInt / 4));
Constant *Loaded = ConstantFoldLoadFromConstPtr(C, Int32Ty, DL);
if (!Loaded)
return nullptr;
auto *LoadedCE = dyn_cast<ConstantExpr>(Loaded);
if (!LoadedCE)
return nullptr;
if (LoadedCE->getOpcode() == Instruction::Trunc) {
LoadedCE = dyn_cast<ConstantExpr>(LoadedCE->getOperand(0));
if (!LoadedCE)
return nullptr;
}
if (LoadedCE->getOpcode() != Instruction::Sub)
return nullptr;
auto *LoadedLHS = dyn_cast<ConstantExpr>(LoadedCE->getOperand(0));
if (!LoadedLHS || LoadedLHS->getOpcode() != Instruction::PtrToInt)
return nullptr;
auto *LoadedLHSPtr = LoadedLHS->getOperand(0);
Constant *LoadedRHS = LoadedCE->getOperand(1);
GlobalValue *LoadedRHSSym;
APInt LoadedRHSOffset;
if (!IsConstantOffsetFromGlobal(LoadedRHS, LoadedRHSSym, LoadedRHSOffset,
DL) ||
PtrSym != LoadedRHSSym || PtrOffset != LoadedRHSOffset)
return nullptr;
return ConstantExpr::getBitCast(LoadedLHSPtr, Int8PtrTy);
}
static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
const SimplifyQuery &Q) {
// Idempotent functions return the same result when called repeatedly.
Intrinsic::ID IID = F->getIntrinsicID();
if (IsIdempotent(IID))
if (auto *II = dyn_cast<IntrinsicInst>(Op0))
if (II->getIntrinsicID() == IID)
return II;
Value *X;
switch (IID) {
case Intrinsic::fabs:
if (SignBitMustBeZero(Op0, Q.TLI)) return Op0;
break;
case Intrinsic::bswap:
// bswap(bswap(x)) -> x
if (match(Op0, m_BSwap(m_Value(X)))) return X;
break;
case Intrinsic::bitreverse:
// bitreverse(bitreverse(x)) -> x
if (match(Op0, m_BitReverse(m_Value(X)))) return X;
break;
case Intrinsic::exp:
// exp(log(x)) -> x
if (Q.CxtI->hasAllowReassoc() &&
match(Op0, m_Intrinsic<Intrinsic::log>(m_Value(X)))) return X;
break;
case Intrinsic::exp2:
// exp2(log2(x)) -> x
if (Q.CxtI->hasAllowReassoc() &&
match(Op0, m_Intrinsic<Intrinsic::log2>(m_Value(X)))) return X;
break;
case Intrinsic::log:
// log(exp(x)) -> x
if (Q.CxtI->hasAllowReassoc() &&
match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X)))) return X;
break;
case Intrinsic::log2:
// log2(exp2(x)) -> x
if (Q.CxtI->hasAllowReassoc() &&
(match(Op0, m_Intrinsic<Intrinsic::exp2>(m_Value(X))) ||
match(Op0, m_Intrinsic<Intrinsic::pow>(m_SpecificFP(2.0),
m_Value(X))))) return X;
break;
case Intrinsic::log10:
// log10(pow(10.0, x)) -> x
if (Q.CxtI->hasAllowReassoc() &&
match(Op0, m_Intrinsic<Intrinsic::pow>(m_SpecificFP(10.0),
m_Value(X)))) return X;
break;
case Intrinsic::floor:
case Intrinsic::trunc:
case Intrinsic::ceil:
case Intrinsic::round:
case Intrinsic::nearbyint:
case Intrinsic::rint: {
// floor (sitofp x) -> sitofp x
// floor (uitofp x) -> uitofp x
//
// Converting from int always results in a finite integral number or
// infinity. For either of those inputs, these rounding functions always
// return the same value, so the rounding can be eliminated.
if (match(Op0, m_SIToFP(m_Value())) || match(Op0, m_UIToFP(m_Value())))
return Op0;
break;
}
default:
break;
}
return nullptr;
}
static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
const SimplifyQuery &Q) {
Intrinsic::ID IID = F->getIntrinsicID();
Type *ReturnType = F->getReturnType();
switch (IID) {
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow:
// X - X -> { 0, false }
if (Op0 == Op1)
return Constant::getNullValue(ReturnType);
LLVM_FALLTHROUGH;
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
// X - undef -> { undef, false }
// undef - X -> { undef, false }
// X + undef -> { undef, false }
// undef + X -> { undef, false }
if (isa<UndefValue>(Op0) || isa<UndefValue>(Op1)) {
return ConstantStruct::get(
cast<StructType>(ReturnType),
{UndefValue::get(ReturnType->getStructElementType(0)),
Constant::getNullValue(ReturnType->getStructElementType(1))});
}
break;
case Intrinsic::umul_with_overflow:
case Intrinsic::smul_with_overflow:
// 0 * X -> { 0, false }
// X * 0 -> { 0, false }
if (match(Op0, m_Zero()) || match(Op1, m_Zero()))
return Constant::getNullValue(ReturnType);
// undef * X -> { 0, false }
// X * undef -> { 0, false }
if (match(Op0, m_Undef()) || match(Op1, m_Undef()))
return Constant::getNullValue(ReturnType);
break;
case Intrinsic::uadd_sat:
// sat(MAX + X) -> MAX
// sat(X + MAX) -> MAX
if (match(Op0, m_AllOnes()) || match(Op1, m_AllOnes()))
return Constant::getAllOnesValue(ReturnType);
LLVM_FALLTHROUGH;
case Intrinsic::sadd_sat:
// sat(X + undef) -> -1
// sat(undef + X) -> -1
// For unsigned: Assume undef is MAX, thus we saturate to MAX (-1).
// For signed: Assume undef is ~X, in which case X + ~X = -1.
if (match(Op0, m_Undef()) || match(Op1, m_Undef()))
return Constant::getAllOnesValue(ReturnType);
// X + 0 -> X
if (match(Op1, m_Zero()))
return Op0;
// 0 + X -> X
if (match(Op0, m_Zero()))
return Op1;
break;
case Intrinsic::usub_sat:
// sat(0 - X) -> 0, sat(X - MAX) -> 0
if (match(Op0, m_Zero()) || match(Op1, m_AllOnes()))
return Constant::getNullValue(ReturnType);
LLVM_FALLTHROUGH;
case Intrinsic::ssub_sat:
// X - X -> 0, X - undef -> 0, undef - X -> 0
if (Op0 == Op1 || match(Op0, m_Undef()) || match(Op1, m_Undef()))
return Constant::getNullValue(ReturnType);
// X - 0 -> X
if (match(Op1, m_Zero()))
return Op0;
break;
case Intrinsic::load_relative:
if (auto *C0 = dyn_cast<Constant>(Op0))
if (auto *C1 = dyn_cast<Constant>(Op1))
return SimplifyRelativeLoad(C0, C1, Q.DL);
break;
case Intrinsic::powi:
if (auto *Power = dyn_cast<ConstantInt>(Op1)) {
// powi(x, 0) -> 1.0
if (Power->isZero())
return ConstantFP::get(Op0->getType(), 1.0);
// powi(x, 1) -> x
if (Power->isOne())
return Op0;
}
break;
case Intrinsic::maxnum:
case Intrinsic::minnum:
case Intrinsic::maximum:
case Intrinsic::minimum: {
// If the arguments are the same, this is a no-op.
if (Op0 == Op1) return Op0;
// If one argument is undef, return the other argument.
if (match(Op0, m_Undef()))
return Op1;
if (match(Op1, m_Undef()))
return Op0;
// If one argument is NaN, return other or NaN appropriately.
bool PropagateNaN = IID == Intrinsic::minimum || IID == Intrinsic::maximum;
if (match(Op0, m_NaN()))
return PropagateNaN ? Op0 : Op1;
if (match(Op1, m_NaN()))
return PropagateNaN ? Op1 : Op0;
// Min/max of the same operation with common operand:
// m(m(X, Y), X) --> m(X, Y) (4 commuted variants)
if (auto *M0 = dyn_cast<IntrinsicInst>(Op0))
if (M0->getIntrinsicID() == IID &&
(M0->getOperand(0) == Op1 || M0->getOperand(1) == Op1))
return Op0;
if (auto *M1 = dyn_cast<IntrinsicInst>(Op1))
if (M1->getIntrinsicID() == IID &&
(M1->getOperand(0) == Op0 || M1->getOperand(1) == Op0))
return Op1;
// min(X, -Inf) --> -Inf (and commuted variant)
// max(X, +Inf) --> +Inf (and commuted variant)
bool UseNegInf = IID == Intrinsic::minnum || IID == Intrinsic::minimum;
const APFloat *C;
if ((match(Op0, m_APFloat(C)) && C->isInfinity() &&
C->isNegative() == UseNegInf) ||
(match(Op1, m_APFloat(C)) && C->isInfinity() &&
C->isNegative() == UseNegInf))
return ConstantFP::getInfinity(ReturnType, UseNegInf);
// TODO: minnum(nnan x, inf) -> x
// TODO: minnum(nnan ninf x, flt_max) -> x
// TODO: maxnum(nnan x, -inf) -> x
// TODO: maxnum(nnan ninf x, -flt_max) -> x
break;
}
default:
break;
}
return nullptr;
}
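// Editorial sketch (not part of the original file): two of the folds above,
// spelled out at the IR level for an assumed i8 operand %x:
//   %r = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 %x, i8 undef)
//        ; folds to { i8 undef, i1 false }
//   %s = call i8 @llvm.uadd.sat.i8(i8 %x, i8 -1)
//        ; folds to i8 -1 (the unsigned maximum)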
static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) {
// Intrinsics with no operands have some kind of side effect. Don't simplify.
unsigned NumOperands = Call->getNumArgOperands();
if (!NumOperands)
return nullptr;
Function *F = cast<Function>(Call->getCalledFunction());
Intrinsic::ID IID = F->getIntrinsicID();
if (NumOperands == 1)
return simplifyUnaryIntrinsic(F, Call->getArgOperand(0), Q);
if (NumOperands == 2)
return simplifyBinaryIntrinsic(F, Call->getArgOperand(0),
Call->getArgOperand(1), Q);
// Handle intrinsics with 3 or more arguments.
switch (IID) {
case Intrinsic::masked_load:
case Intrinsic::masked_gather: {
Value *MaskArg = Call->getArgOperand(2);
Value *PassthruArg = Call->getArgOperand(3);
// If the mask is all zeros or undef, the "passthru" argument is the result.
if (maskIsAllZeroOrUndef(MaskArg))
return PassthruArg;
return nullptr;
}
case Intrinsic::fshl:
case Intrinsic::fshr: {
Value *Op0 = Call->getArgOperand(0), *Op1 = Call->getArgOperand(1),
*ShAmtArg = Call->getArgOperand(2);
// If both operands are undef, the result is undef.
if (match(Op0, m_Undef()) && match(Op1, m_Undef()))
return UndefValue::get(F->getReturnType());
// If shift amount is undef, assume it is zero.
if (match(ShAmtArg, m_Undef()))
return Call->getArgOperand(IID == Intrinsic::fshl ? 0 : 1);
const APInt *ShAmtC;
if (match(ShAmtArg, m_APInt(ShAmtC))) {
// If there's effectively no shift, return the 1st arg or 2nd arg.
APInt BitWidth = APInt(ShAmtC->getBitWidth(), ShAmtC->getBitWidth());
if (ShAmtC->urem(BitWidth).isNullValue())
return Call->getArgOperand(IID == Intrinsic::fshl ? 0 : 1);
}
return nullptr;
}
default:
return nullptr;
}
}
Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) {
Value *Callee = Call->getCalledValue();
// call undef -> undef
// call null -> undef
if (isa<UndefValue>(Callee) || isa<ConstantPointerNull>(Callee))
return UndefValue::get(Call->getType());
Function *F = dyn_cast<Function>(Callee);
if (!F)
return nullptr;
if (F->isIntrinsic())
if (Value *Ret = simplifyIntrinsic(Call, Q))
return Ret;
if (!canConstantFoldCallTo(Call, F))
return nullptr;
SmallVector<Constant *, 4> ConstantArgs;
unsigned NumArgs = Call->getNumArgOperands();
ConstantArgs.reserve(NumArgs);
for (auto &Arg : Call->args()) {
Constant *C = dyn_cast<Constant>(&Arg);
if (!C)
return nullptr;
ConstantArgs.push_back(C);
}
return ConstantFoldCall(Call, F, ConstantArgs, Q.TLI);
}
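// Editorial sketch (not part of the original file): a minimal caller of
// SimplifyCall. The helper name and the decision to leave the dead call for a
// later DCE pass are assumptions for illustration only.
static void sketchSimplifyCallSite(CallBase *Call,
                                   const TargetLibraryInfo *TLI) {
  const DataLayout &DL = Call->getModule()->getDataLayout();
  // DominatorTree and AssumptionCache default to null in the query; that is
  // legal, it just gives the simplifier less context to work with.
  if (Value *V = SimplifyCall(Call, {DL, TLI}))
    Call->replaceAllUsesWith(V); // The now-dead call is left for DCE.
}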
/// See if we can compute a simplified version of this instruction.
/// If not, this returns null.
Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
OptimizationRemarkEmitter *ORE) {
const SimplifyQuery Q = SQ.CxtI ? SQ : SQ.getWithInstruction(I);
Value *Result;
switch (I->getOpcode()) {
default:
Result = ConstantFoldInstruction(I, Q.DL, Q.TLI);
break;
case Instruction::FNeg:
Result = SimplifyFNegInst(I->getOperand(0), I->getFastMathFlags(), Q);
break;
case Instruction::FAdd:
Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1),
I->getFastMathFlags(), Q);
break;
case Instruction::Add:
Result =
SimplifyAddInst(I->getOperand(0), I->getOperand(1),
Q.IIQ.hasNoSignedWrap(cast<BinaryOperator>(I)),
Q.IIQ.hasNoUnsignedWrap(cast<BinaryOperator>(I)), Q);
break;
case Instruction::FSub:
Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1),
I->getFastMathFlags(), Q);
break;
case Instruction::Sub:
Result =
SimplifySubInst(I->getOperand(0), I->getOperand(1),
Q.IIQ.hasNoSignedWrap(cast<BinaryOperator>(I)),
Q.IIQ.hasNoUnsignedWrap(cast<BinaryOperator>(I)), Q);
break;
case Instruction::FMul:
Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1),
I->getFastMathFlags(), Q);
break;
case Instruction::Mul:
Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::SDiv:
Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::UDiv:
Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::FDiv:
Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1),
I->getFastMathFlags(), Q);
break;
case Instruction::SRem:
Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::URem:
Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::FRem:
Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1),
I->getFastMathFlags(), Q);
break;
case Instruction::Shl:
Result =
SimplifyShlInst(I->getOperand(0), I->getOperand(1),
Q.IIQ.hasNoSignedWrap(cast<BinaryOperator>(I)),
Q.IIQ.hasNoUnsignedWrap(cast<BinaryOperator>(I)), Q);
break;
case Instruction::LShr:
Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1),
Q.IIQ.isExact(cast<BinaryOperator>(I)), Q);
break;
case Instruction::AShr:
Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1),
Q.IIQ.isExact(cast<BinaryOperator>(I)), Q);
break;
case Instruction::And:
Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::Or:
Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::Xor:
Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::ICmp:
Result = SimplifyICmpInst(cast<ICmpInst>(I)->getPredicate(),
I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::FCmp:
Result =
SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(), I->getOperand(0),
I->getOperand(1), I->getFastMathFlags(), Q);
break;
case Instruction::Select:
Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1),
I->getOperand(2), Q);
break;
case Instruction::GetElementPtr: {
SmallVector<Value *, 8> Ops(I->op_begin(), I->op_end());
Result = SimplifyGEPInst(cast<GetElementPtrInst>(I)->getSourceElementType(),
Ops, Q);
break;
}
case Instruction::InsertValue: {
InsertValueInst *IV = cast<InsertValueInst>(I);
Result = SimplifyInsertValueInst(IV->getAggregateOperand(),
IV->getInsertedValueOperand(),
IV->getIndices(), Q);
break;
}
case Instruction::InsertElement: {
auto *IE = cast<InsertElementInst>(I);
Result = SimplifyInsertElementInst(IE->getOperand(0), IE->getOperand(1),
IE->getOperand(2), Q);
break;
}
case Instruction::ExtractValue: {
auto *EVI = cast<ExtractValueInst>(I);
Result = SimplifyExtractValueInst(EVI->getAggregateOperand(),
EVI->getIndices(), Q);
break;
}
case Instruction::ExtractElement: {
auto *EEI = cast<ExtractElementInst>(I);
Result = SimplifyExtractElementInst(EEI->getVectorOperand(),
EEI->getIndexOperand(), Q);
break;
}
case Instruction::ShuffleVector: {
auto *SVI = cast<ShuffleVectorInst>(I);
Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
SVI->getMask(), SVI->getType(), Q);
break;
}
case Instruction::PHI:
Result = SimplifyPHINode(cast<PHINode>(I), Q);
break;
case Instruction::Call: {
Result = SimplifyCall(cast<CallInst>(I), Q);
break;
}
#define HANDLE_CAST_INST(num, opc, clas) case Instruction::opc:
#include "llvm/IR/Instruction.def"
#undef HANDLE_CAST_INST
Result =
SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(), Q);
break;
case Instruction::Alloca:
// No simplifications for Alloca and it can't be constant folded.
Result = nullptr;
break;
}
// In general, it is possible for computeKnownBits to determine all bits in a
// value even when the operands are not all constants.
if (!Result && I->getType()->isIntOrIntVectorTy()) {
KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE);
if (Known.isConstant())
Result = ConstantInt::get(I->getType(), Known.getConstant());
}
/// If called on unreachable code, the above logic may report that the
/// instruction simplified to itself. Make life easier for users by
/// detecting that case here, returning a safe value instead.
return Result == I ? UndefValue::get(I->getType()) : Result;
}
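// Editorial sketch (not part of the original file): folding every instruction
// in a basic block with SimplifyInstruction. The helper name is hypothetical;
// erasure is deferred so iteration over the block stays valid.
static void sketchSimplifyBlock(BasicBlock &BB, const SimplifyQuery &Q) {
  SmallVector<Instruction *, 8> DeadInsts;
  for (Instruction &I : BB)
    if (Value *V = SimplifyInstruction(&I, Q)) {
      I.replaceAllUsesWith(V);
      DeadInsts.push_back(&I);
    }
  // Only drop instructions whose uses were fully replaced above.
  for (Instruction *I : DeadInsts)
    if (I->use_empty())
      I->eraseFromParent();
}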
/// Implementation of recursive simplification through an instruction's
/// uses.
///
/// This is the common implementation of the recursive simplification routines.
/// If we have a pre-simplified value in 'SimpleV', that is forcibly used to
/// replace the instruction 'I'. Otherwise, we simply add 'I' to the list of
/// instructions to process and attempt to simplify it using
-/// InstructionSimplify.
+/// InstructionSimplify. Recursively visited users which could not be
+/// simplified themselves are added to the optional UnsimplifiedUsers set for
+/// further processing by the caller.
///
/// This routine returns 'true' only when *it* simplifies something. The passed
/// in simplified value does not count toward this.
-static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT,
- AssumptionCache *AC) {
+static bool replaceAndRecursivelySimplifyImpl(
+ Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI,
+ const DominatorTree *DT, AssumptionCache *AC,
+ SmallSetVector<Instruction *, 8> *UnsimplifiedUsers = nullptr) {
bool Simplified = false;
SmallSetVector<Instruction *, 8> Worklist;
const DataLayout &DL = I->getModule()->getDataLayout();
// If we have an explicit value to collapse to, do that round of the
// simplification loop by hand initially.
if (SimpleV) {
for (User *U : I->users())
if (U != I)
Worklist.insert(cast<Instruction>(U));
// Replace the instruction with its simplified value.
I->replaceAllUsesWith(SimpleV);
// Gracefully handle edge cases where the instruction is not wired into any
// parent block.
if (I->getParent() && !I->isEHPad() && !I->isTerminator() &&
!I->mayHaveSideEffects())
I->eraseFromParent();
} else {
Worklist.insert(I);
}
// Note that we must test the size on each iteration, the worklist can grow.
for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
I = Worklist[Idx];
// See if this instruction simplifies.
SimpleV = SimplifyInstruction(I, {DL, TLI, DT, AC});
- if (!SimpleV)
+ if (!SimpleV) {
+ if (UnsimplifiedUsers)
+ UnsimplifiedUsers->insert(I);
continue;
+ }
Simplified = true;
// Stash away all the uses of the old instruction so we can check them for
// recursive simplifications after a RAUW. This is cheaper than checking all
// uses of To on the recursive step in most cases.
for (User *U : I->users())
Worklist.insert(cast<Instruction>(U));
// Replace the instruction with its simplified value.
I->replaceAllUsesWith(SimpleV);
// Gracefully handle edge cases where the instruction is not wired into any
// parent block.
if (I->getParent() && !I->isEHPad() && !I->isTerminator() &&
!I->mayHaveSideEffects())
I->eraseFromParent();
}
return Simplified;
}
bool llvm::recursivelySimplifyInstruction(Instruction *I,
const TargetLibraryInfo *TLI,
const DominatorTree *DT,
AssumptionCache *AC) {
- return replaceAndRecursivelySimplifyImpl(I, nullptr, TLI, DT, AC);
+ return replaceAndRecursivelySimplifyImpl(I, nullptr, TLI, DT, AC, nullptr);
}
-bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT,
- AssumptionCache *AC) {
+bool llvm::replaceAndRecursivelySimplify(
+ Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI,
+ const DominatorTree *DT, AssumptionCache *AC,
+ SmallSetVector<Instruction *, 8> *UnsimplifiedUsers) {
assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!");
assert(SimpleV && "Must provide a simplified value.");
- return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC);
+ return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC,
+ UnsimplifiedUsers);
}
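// Editorial sketch (not part of the original file): using the new
// UnsimplifiedUsers out-parameter added above. The helper and the idea of
// handing the leftover users back for a later revisit are assumptions for
// illustration only.
static void sketchReplaceAndCollect(Instruction *I, Value *NewV,
                                    const TargetLibraryInfo *TLI,
                                    const DominatorTree *DT,
                                    AssumptionCache *AC,
                                    SmallVectorImpl<Instruction *> &Revisit) {
  SmallSetVector<Instruction *, 8> Unsimplified;
  replaceAndRecursivelySimplify(I, NewV, TLI, DT, AC, &Unsimplified);
  // Users that could not be folded away are returned so the caller can queue
  // them for further processing.
  Revisit.append(Unsimplified.begin(), Unsimplified.end());
}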
namespace llvm {
const SimplifyQuery getBestSimplifyQuery(Pass &P, Function &F) {
auto *DTWP = P.getAnalysisIfAvailable<DominatorTreeWrapperPass>();
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *TLIWP = P.getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
auto *TLI = TLIWP ? &TLIWP->getTLI() : nullptr;
auto *ACWP = P.getAnalysisIfAvailable<AssumptionCacheTracker>();
auto *AC = ACWP ? &ACWP->getAssumptionCache(F) : nullptr;
return {F.getParent()->getDataLayout(), TLI, DT, AC};
}
const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &AR,
const DataLayout &DL) {
return {DL, &AR.TLI, &AR.DT, &AR.AC};
}
template <class T, class... TArgs>
const SimplifyQuery getBestSimplifyQuery(AnalysisManager<T, TArgs...> &AM,
Function &F) {
auto *DT = AM.template getCachedResult<DominatorTreeAnalysis>(F);
auto *TLI = AM.template getCachedResult<TargetLibraryAnalysis>(F);
auto *AC = AM.template getCachedResult<AssumptionAnalysis>(F);
return {F.getParent()->getDataLayout(), TLI, DT, AC};
}
template const SimplifyQuery getBestSimplifyQuery(AnalysisManager<Function> &,
Function &);
}
Index: projects/clang900-import/contrib/llvm/lib/IR/Core.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/IR/Core.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/IR/Core.cpp (revision 351722)
@@ -1,4043 +1,4052 @@
//===-- Core.cpp ----------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the common infrastructure (including the C bindings)
// for libLLVMCore.a, which implements the LLVM intermediate representation.
//
//===----------------------------------------------------------------------===//
#include "llvm-c/Core.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <system_error>
using namespace llvm;
#define DEBUG_TYPE "ir"
void llvm::initializeCore(PassRegistry &Registry) {
initializeDominatorTreeWrapperPassPass(Registry);
initializePrintModulePassWrapperPass(Registry);
initializePrintFunctionPassWrapperPass(Registry);
initializePrintBasicBlockPassPass(Registry);
initializeSafepointIRVerifierPass(Registry);
initializeVerifierLegacyPassPass(Registry);
}
void LLVMInitializeCore(LLVMPassRegistryRef R) {
initializeCore(*unwrap(R));
}
void LLVMShutdown() {
llvm_shutdown();
}
/*===-- Error handling ----------------------------------------------------===*/
char *LLVMCreateMessage(const char *Message) {
return strdup(Message);
}
void LLVMDisposeMessage(char *Message) {
free(Message);
}
/*===-- Operations on contexts --------------------------------------------===*/
static ManagedStatic<LLVMContext> GlobalContext;
LLVMContextRef LLVMContextCreate() {
return wrap(new LLVMContext());
}
LLVMContextRef LLVMGetGlobalContext() { return wrap(&*GlobalContext); }
void LLVMContextSetDiagnosticHandler(LLVMContextRef C,
LLVMDiagnosticHandler Handler,
void *DiagnosticContext) {
unwrap(C)->setDiagnosticHandlerCallBack(
LLVM_EXTENSION reinterpret_cast<DiagnosticHandler::DiagnosticHandlerTy>(
Handler),
DiagnosticContext);
}
LLVMDiagnosticHandler LLVMContextGetDiagnosticHandler(LLVMContextRef C) {
return LLVM_EXTENSION reinterpret_cast<LLVMDiagnosticHandler>(
unwrap(C)->getDiagnosticHandlerCallBack());
}
void *LLVMContextGetDiagnosticContext(LLVMContextRef C) {
return unwrap(C)->getDiagnosticContext();
}
void LLVMContextSetYieldCallback(LLVMContextRef C, LLVMYieldCallback Callback,
void *OpaqueHandle) {
auto YieldCallback =
LLVM_EXTENSION reinterpret_cast<LLVMContext::YieldCallbackTy>(Callback);
unwrap(C)->setYieldCallback(YieldCallback, OpaqueHandle);
}
LLVMBool LLVMContextShouldDiscardValueNames(LLVMContextRef C) {
return unwrap(C)->shouldDiscardValueNames();
}
void LLVMContextSetDiscardValueNames(LLVMContextRef C, LLVMBool Discard) {
unwrap(C)->setDiscardValueNames(Discard);
}
void LLVMContextDispose(LLVMContextRef C) {
delete unwrap(C);
}
unsigned LLVMGetMDKindIDInContext(LLVMContextRef C, const char *Name,
unsigned SLen) {
return unwrap(C)->getMDKindID(StringRef(Name, SLen));
}
unsigned LLVMGetMDKindID(const char *Name, unsigned SLen) {
return LLVMGetMDKindIDInContext(LLVMGetGlobalContext(), Name, SLen);
}
#define GET_ATTR_KIND_FROM_NAME
#include "AttributesCompatFunc.inc"
unsigned LLVMGetEnumAttributeKindForName(const char *Name, size_t SLen) {
return getAttrKindFromName(StringRef(Name, SLen));
}
unsigned LLVMGetLastEnumAttributeKind(void) {
return Attribute::AttrKind::EndAttrKinds;
}
LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID,
uint64_t Val) {
- return wrap(Attribute::get(*unwrap(C), (Attribute::AttrKind)KindID, Val));
+ auto &Ctx = *unwrap(C);
+ auto AttrKind = (Attribute::AttrKind)KindID;
+
+ if (AttrKind == Attribute::AttrKind::ByVal) {
+ // After r362128, byval attributes need to have a type attribute. Provide a
+ // NULL one until a proper API is added for this.
+ return wrap(Attribute::getWithByValType(Ctx, NULL));
+ } else {
+ return wrap(Attribute::get(Ctx, AttrKind, Val));
+ }
}
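// Editorial sketch (not part of the original file): how a C-API client might
// hit the byval special case above. The helper name and the Ctx/Fn values are
// assumed to come from the client.
static void sketchAddByValAttribute(LLVMContextRef Ctx, LLVMValueRef Fn) {
  unsigned Kind = LLVMGetEnumAttributeKindForName("byval", 5);
  // The value is ignored for byval here; the attribute is created with a NULL
  // type until a typed API exists.
  LLVMAttributeRef A = LLVMCreateEnumAttribute(Ctx, Kind, 0);
  LLVMAddAttributeAtIndex(Fn, 1 /* first parameter */, A);
}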
unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A) {
return unwrap(A).getKindAsEnum();
}
uint64_t LLVMGetEnumAttributeValue(LLVMAttributeRef A) {
auto Attr = unwrap(A);
if (Attr.isEnumAttribute())
return 0;
return Attr.getValueAsInt();
}
LLVMAttributeRef LLVMCreateStringAttribute(LLVMContextRef C,
const char *K, unsigned KLength,
const char *V, unsigned VLength) {
return wrap(Attribute::get(*unwrap(C), StringRef(K, KLength),
StringRef(V, VLength)));
}
const char *LLVMGetStringAttributeKind(LLVMAttributeRef A,
unsigned *Length) {
auto S = unwrap(A).getKindAsString();
*Length = S.size();
return S.data();
}
const char *LLVMGetStringAttributeValue(LLVMAttributeRef A,
unsigned *Length) {
auto S = unwrap(A).getValueAsString();
*Length = S.size();
return S.data();
}
LLVMBool LLVMIsEnumAttribute(LLVMAttributeRef A) {
auto Attr = unwrap(A);
return Attr.isEnumAttribute() || Attr.isIntAttribute();
}
LLVMBool LLVMIsStringAttribute(LLVMAttributeRef A) {
return unwrap(A).isStringAttribute();
}
char *LLVMGetDiagInfoDescription(LLVMDiagnosticInfoRef DI) {
std::string MsgStorage;
raw_string_ostream Stream(MsgStorage);
DiagnosticPrinterRawOStream DP(Stream);
unwrap(DI)->print(DP);
Stream.flush();
return LLVMCreateMessage(MsgStorage.c_str());
}
LLVMDiagnosticSeverity LLVMGetDiagInfoSeverity(LLVMDiagnosticInfoRef DI) {
LLVMDiagnosticSeverity severity;
switch(unwrap(DI)->getSeverity()) {
default:
severity = LLVMDSError;
break;
case DS_Warning:
severity = LLVMDSWarning;
break;
case DS_Remark:
severity = LLVMDSRemark;
break;
case DS_Note:
severity = LLVMDSNote;
break;
}
return severity;
}
/*===-- Operations on modules ---------------------------------------------===*/
LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) {
return wrap(new Module(ModuleID, *GlobalContext));
}
LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID,
LLVMContextRef C) {
return wrap(new Module(ModuleID, *unwrap(C)));
}
void LLVMDisposeModule(LLVMModuleRef M) {
delete unwrap(M);
}
const char *LLVMGetModuleIdentifier(LLVMModuleRef M, size_t *Len) {
auto &Str = unwrap(M)->getModuleIdentifier();
*Len = Str.length();
return Str.c_str();
}
void LLVMSetModuleIdentifier(LLVMModuleRef M, const char *Ident, size_t Len) {
unwrap(M)->setModuleIdentifier(StringRef(Ident, Len));
}
const char *LLVMGetSourceFileName(LLVMModuleRef M, size_t *Len) {
auto &Str = unwrap(M)->getSourceFileName();
*Len = Str.length();
return Str.c_str();
}
void LLVMSetSourceFileName(LLVMModuleRef M, const char *Name, size_t Len) {
unwrap(M)->setSourceFileName(StringRef(Name, Len));
}
/*--.. Data layout .........................................................--*/
const char *LLVMGetDataLayoutStr(LLVMModuleRef M) {
return unwrap(M)->getDataLayoutStr().c_str();
}
const char *LLVMGetDataLayout(LLVMModuleRef M) {
return LLVMGetDataLayoutStr(M);
}
void LLVMSetDataLayout(LLVMModuleRef M, const char *DataLayoutStr) {
unwrap(M)->setDataLayout(DataLayoutStr);
}
/*--.. Target triple .......................................................--*/
const char * LLVMGetTarget(LLVMModuleRef M) {
return unwrap(M)->getTargetTriple().c_str();
}
void LLVMSetTarget(LLVMModuleRef M, const char *Triple) {
unwrap(M)->setTargetTriple(Triple);
}
/*--.. Module flags ........................................................--*/
struct LLVMOpaqueModuleFlagEntry {
LLVMModuleFlagBehavior Behavior;
const char *Key;
size_t KeyLen;
LLVMMetadataRef Metadata;
};
static Module::ModFlagBehavior
map_to_llvmModFlagBehavior(LLVMModuleFlagBehavior Behavior) {
switch (Behavior) {
case LLVMModuleFlagBehaviorError:
return Module::ModFlagBehavior::Error;
case LLVMModuleFlagBehaviorWarning:
return Module::ModFlagBehavior::Warning;
case LLVMModuleFlagBehaviorRequire:
return Module::ModFlagBehavior::Require;
case LLVMModuleFlagBehaviorOverride:
return Module::ModFlagBehavior::Override;
case LLVMModuleFlagBehaviorAppend:
return Module::ModFlagBehavior::Append;
case LLVMModuleFlagBehaviorAppendUnique:
return Module::ModFlagBehavior::AppendUnique;
}
llvm_unreachable("Unknown LLVMModuleFlagBehavior");
}
static LLVMModuleFlagBehavior
map_from_llvmModFlagBehavior(Module::ModFlagBehavior Behavior) {
switch (Behavior) {
case Module::ModFlagBehavior::Error:
return LLVMModuleFlagBehaviorError;
case Module::ModFlagBehavior::Warning:
return LLVMModuleFlagBehaviorWarning;
case Module::ModFlagBehavior::Require:
return LLVMModuleFlagBehaviorRequire;
case Module::ModFlagBehavior::Override:
return LLVMModuleFlagBehaviorOverride;
case Module::ModFlagBehavior::Append:
return LLVMModuleFlagBehaviorAppend;
case Module::ModFlagBehavior::AppendUnique:
return LLVMModuleFlagBehaviorAppendUnique;
default:
llvm_unreachable("Unhandled Flag Behavior");
}
}
LLVMModuleFlagEntry *LLVMCopyModuleFlagsMetadata(LLVMModuleRef M, size_t *Len) {
SmallVector<Module::ModuleFlagEntry, 8> MFEs;
unwrap(M)->getModuleFlagsMetadata(MFEs);
LLVMOpaqueModuleFlagEntry *Result = static_cast<LLVMOpaqueModuleFlagEntry *>(
safe_malloc(MFEs.size() * sizeof(LLVMOpaqueModuleFlagEntry)));
for (unsigned i = 0; i < MFEs.size(); ++i) {
const auto &ModuleFlag = MFEs[i];
Result[i].Behavior = map_from_llvmModFlagBehavior(ModuleFlag.Behavior);
Result[i].Key = ModuleFlag.Key->getString().data();
Result[i].KeyLen = ModuleFlag.Key->getString().size();
Result[i].Metadata = wrap(ModuleFlag.Val);
}
*Len = MFEs.size();
return Result;
}
void LLVMDisposeModuleFlagsMetadata(LLVMModuleFlagEntry *Entries) {
free(Entries);
}
LLVMModuleFlagBehavior
LLVMModuleFlagEntriesGetFlagBehavior(LLVMModuleFlagEntry *Entries,
unsigned Index) {
LLVMOpaqueModuleFlagEntry MFE =
static_cast<LLVMOpaqueModuleFlagEntry>(Entries[Index]);
return MFE.Behavior;
}
const char *LLVMModuleFlagEntriesGetKey(LLVMModuleFlagEntry *Entries,
unsigned Index, size_t *Len) {
LLVMOpaqueModuleFlagEntry MFE =
static_cast<LLVMOpaqueModuleFlagEntry>(Entries[Index]);
*Len = MFE.KeyLen;
return MFE.Key;
}
LLVMMetadataRef LLVMModuleFlagEntriesGetMetadata(LLVMModuleFlagEntry *Entries,
unsigned Index) {
LLVMOpaqueModuleFlagEntry MFE =
static_cast<LLVMOpaqueModuleFlagEntry>(Entries[Index]);
return MFE.Metadata;
}
LLVMMetadataRef LLVMGetModuleFlag(LLVMModuleRef M,
const char *Key, size_t KeyLen) {
return wrap(unwrap(M)->getModuleFlag({Key, KeyLen}));
}
void LLVMAddModuleFlag(LLVMModuleRef M, LLVMModuleFlagBehavior Behavior,
const char *Key, size_t KeyLen,
LLVMMetadataRef Val) {
unwrap(M)->addModuleFlag(map_to_llvmModFlagBehavior(Behavior),
{Key, KeyLen}, unwrap(Val));
}
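// Editorial sketch (not part of the original file): enumerating module flags
// through the C API above. The helper name is hypothetical; the returned entry
// array is heap-allocated and must be disposed by the caller.
static void sketchListModuleFlags(LLVMModuleRef M) {
  size_t Len = 0;
  LLVMModuleFlagEntry *Entries = LLVMCopyModuleFlagsMetadata(M, &Len);
  for (size_t I = 0; I != Len; ++I) {
    size_t KeyLen = 0;
    const char *Key = LLVMModuleFlagEntriesGetKey(Entries, (unsigned)I, &KeyLen);
    errs() << StringRef(Key, KeyLen) << "\n";
  }
  LLVMDisposeModuleFlagsMetadata(Entries);
}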
/*--.. Printing modules ....................................................--*/
void LLVMDumpModule(LLVMModuleRef M) {
unwrap(M)->print(errs(), nullptr,
/*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/true);
}
LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
char **ErrorMessage) {
std::error_code EC;
raw_fd_ostream dest(Filename, EC, sys::fs::F_Text);
if (EC) {
*ErrorMessage = strdup(EC.message().c_str());
return true;
}
unwrap(M)->print(dest, nullptr);
dest.close();
if (dest.has_error()) {
std::string E = "Error printing to file: " + dest.error().message();
*ErrorMessage = strdup(E.c_str());
return true;
}
return false;
}
char *LLVMPrintModuleToString(LLVMModuleRef M) {
std::string buf;
raw_string_ostream os(buf);
unwrap(M)->print(os, nullptr);
os.flush();
return strdup(buf.c_str());
}
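// Editorial sketch (not part of the original file): the buffer returned by
// LLVMPrintModuleToString above comes from strdup and must be released by the
// caller, e.g. via LLVMDisposeMessage.
static void sketchDumpModuleText(LLVMModuleRef M) {
  char *IR = LLVMPrintModuleToString(M);
  errs() << IR;
  LLVMDisposeMessage(IR);
}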
/*--.. Operations on inline assembler ......................................--*/
void LLVMSetModuleInlineAsm2(LLVMModuleRef M, const char *Asm, size_t Len) {
unwrap(M)->setModuleInlineAsm(StringRef(Asm, Len));
}
void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) {
unwrap(M)->setModuleInlineAsm(StringRef(Asm));
}
void LLVMAppendModuleInlineAsm(LLVMModuleRef M, const char *Asm, size_t Len) {
unwrap(M)->appendModuleInlineAsm(StringRef(Asm, Len));
}
const char *LLVMGetModuleInlineAsm(LLVMModuleRef M, size_t *Len) {
auto &Str = unwrap(M)->getModuleInlineAsm();
*Len = Str.length();
return Str.c_str();
}
LLVMValueRef LLVMGetInlineAsm(LLVMTypeRef Ty,
char *AsmString, size_t AsmStringSize,
char *Constraints, size_t ConstraintsSize,
LLVMBool HasSideEffects, LLVMBool IsAlignStack,
LLVMInlineAsmDialect Dialect) {
InlineAsm::AsmDialect AD;
switch (Dialect) {
case LLVMInlineAsmDialectATT:
AD = InlineAsm::AD_ATT;
break;
case LLVMInlineAsmDialectIntel:
AD = InlineAsm::AD_Intel;
break;
}
return wrap(InlineAsm::get(unwrap<FunctionType>(Ty),
StringRef(AsmString, AsmStringSize),
StringRef(Constraints, ConstraintsSize),
HasSideEffects, IsAlignStack, AD));
}
/*--.. Operations on module contexts ......................................--*/
LLVMContextRef LLVMGetModuleContext(LLVMModuleRef M) {
return wrap(&unwrap(M)->getContext());
}
/*===-- Operations on types -----------------------------------------------===*/
/*--.. Operations on all types (mostly) ....................................--*/
LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
switch (unwrap(Ty)->getTypeID()) {
case Type::VoidTyID:
return LLVMVoidTypeKind;
case Type::HalfTyID:
return LLVMHalfTypeKind;
case Type::FloatTyID:
return LLVMFloatTypeKind;
case Type::DoubleTyID:
return LLVMDoubleTypeKind;
case Type::X86_FP80TyID:
return LLVMX86_FP80TypeKind;
case Type::FP128TyID:
return LLVMFP128TypeKind;
case Type::PPC_FP128TyID:
return LLVMPPC_FP128TypeKind;
case Type::LabelTyID:
return LLVMLabelTypeKind;
case Type::MetadataTyID:
return LLVMMetadataTypeKind;
case Type::IntegerTyID:
return LLVMIntegerTypeKind;
case Type::FunctionTyID:
return LLVMFunctionTypeKind;
case Type::StructTyID:
return LLVMStructTypeKind;
case Type::ArrayTyID:
return LLVMArrayTypeKind;
case Type::PointerTyID:
return LLVMPointerTypeKind;
case Type::VectorTyID:
return LLVMVectorTypeKind;
case Type::X86_MMXTyID:
return LLVMX86_MMXTypeKind;
case Type::TokenTyID:
return LLVMTokenTypeKind;
}
llvm_unreachable("Unhandled TypeID.");
}
LLVMBool LLVMTypeIsSized(LLVMTypeRef Ty)
{
return unwrap(Ty)->isSized();
}
LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) {
return wrap(&unwrap(Ty)->getContext());
}
void LLVMDumpType(LLVMTypeRef Ty) {
return unwrap(Ty)->print(errs(), /*IsForDebug=*/true);
}
char *LLVMPrintTypeToString(LLVMTypeRef Ty) {
std::string buf;
raw_string_ostream os(buf);
if (unwrap(Ty))
unwrap(Ty)->print(os);
else
os << "Printing <null> Type";
os.flush();
return strdup(buf.c_str());
}
/*--.. Operations on integer types .........................................--*/
LLVMTypeRef LLVMInt1TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getInt1Ty(*unwrap(C));
}
LLVMTypeRef LLVMInt8TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getInt8Ty(*unwrap(C));
}
LLVMTypeRef LLVMInt16TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getInt16Ty(*unwrap(C));
}
LLVMTypeRef LLVMInt32TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getInt32Ty(*unwrap(C));
}
LLVMTypeRef LLVMInt64TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getInt64Ty(*unwrap(C));
}
LLVMTypeRef LLVMInt128TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getInt128Ty(*unwrap(C));
}
LLVMTypeRef LLVMIntTypeInContext(LLVMContextRef C, unsigned NumBits) {
return wrap(IntegerType::get(*unwrap(C), NumBits));
}
LLVMTypeRef LLVMInt1Type(void) {
return LLVMInt1TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMInt8Type(void) {
return LLVMInt8TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMInt16Type(void) {
return LLVMInt16TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMInt32Type(void) {
return LLVMInt32TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMInt64Type(void) {
return LLVMInt64TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMInt128Type(void) {
return LLVMInt128TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMIntType(unsigned NumBits) {
return LLVMIntTypeInContext(LLVMGetGlobalContext(), NumBits);
}
unsigned LLVMGetIntTypeWidth(LLVMTypeRef IntegerTy) {
return unwrap<IntegerType>(IntegerTy)->getBitWidth();
}
/*--.. Operations on real types ............................................--*/
LLVMTypeRef LLVMHalfTypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getHalfTy(*unwrap(C));
}
LLVMTypeRef LLVMFloatTypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getFloatTy(*unwrap(C));
}
LLVMTypeRef LLVMDoubleTypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getDoubleTy(*unwrap(C));
}
LLVMTypeRef LLVMX86FP80TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getX86_FP80Ty(*unwrap(C));
}
LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getFP128Ty(*unwrap(C));
}
LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C));
}
LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C));
}
LLVMTypeRef LLVMHalfType(void) {
return LLVMHalfTypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMFloatType(void) {
return LLVMFloatTypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMDoubleType(void) {
return LLVMDoubleTypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMX86FP80Type(void) {
return LLVMX86FP80TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMFP128Type(void) {
return LLVMFP128TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMPPCFP128Type(void) {
return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMX86MMXType(void) {
return LLVMX86MMXTypeInContext(LLVMGetGlobalContext());
}
/*--.. Operations on function types ........................................--*/
LLVMTypeRef LLVMFunctionType(LLVMTypeRef ReturnType,
LLVMTypeRef *ParamTypes, unsigned ParamCount,
LLVMBool IsVarArg) {
ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
return wrap(FunctionType::get(unwrap(ReturnType), Tys, IsVarArg != 0));
}
LLVMBool LLVMIsFunctionVarArg(LLVMTypeRef FunctionTy) {
return unwrap<FunctionType>(FunctionTy)->isVarArg();
}
LLVMTypeRef LLVMGetReturnType(LLVMTypeRef FunctionTy) {
return wrap(unwrap<FunctionType>(FunctionTy)->getReturnType());
}
unsigned LLVMCountParamTypes(LLVMTypeRef FunctionTy) {
return unwrap<FunctionType>(FunctionTy)->getNumParams();
}
void LLVMGetParamTypes(LLVMTypeRef FunctionTy, LLVMTypeRef *Dest) {
FunctionType *Ty = unwrap<FunctionType>(FunctionTy);
for (FunctionType::param_iterator I = Ty->param_begin(),
E = Ty->param_end(); I != E; ++I)
*Dest++ = wrap(*I);
}
/*--.. Operations on struct types ..........................................--*/
LLVMTypeRef LLVMStructTypeInContext(LLVMContextRef C, LLVMTypeRef *ElementTypes,
unsigned ElementCount, LLVMBool Packed) {
ArrayRef<Type*> Tys(unwrap(ElementTypes), ElementCount);
return wrap(StructType::get(*unwrap(C), Tys, Packed != 0));
}
LLVMTypeRef LLVMStructType(LLVMTypeRef *ElementTypes,
unsigned ElementCount, LLVMBool Packed) {
return LLVMStructTypeInContext(LLVMGetGlobalContext(), ElementTypes,
ElementCount, Packed);
}
LLVMTypeRef LLVMStructCreateNamed(LLVMContextRef C, const char *Name)
{
return wrap(StructType::create(*unwrap(C), Name));
}
const char *LLVMGetStructName(LLVMTypeRef Ty)
{
StructType *Type = unwrap<StructType>(Ty);
if (!Type->hasName())
return nullptr;
return Type->getName().data();
}
void LLVMStructSetBody(LLVMTypeRef StructTy, LLVMTypeRef *ElementTypes,
unsigned ElementCount, LLVMBool Packed) {
ArrayRef<Type*> Tys(unwrap(ElementTypes), ElementCount);
unwrap<StructType>(StructTy)->setBody(Tys, Packed != 0);
}
unsigned LLVMCountStructElementTypes(LLVMTypeRef StructTy) {
return unwrap<StructType>(StructTy)->getNumElements();
}
void LLVMGetStructElementTypes(LLVMTypeRef StructTy, LLVMTypeRef *Dest) {
StructType *Ty = unwrap<StructType>(StructTy);
for (StructType::element_iterator I = Ty->element_begin(),
E = Ty->element_end(); I != E; ++I)
*Dest++ = wrap(*I);
}
LLVMTypeRef LLVMStructGetTypeAtIndex(LLVMTypeRef StructTy, unsigned i) {
StructType *Ty = unwrap<StructType>(StructTy);
return wrap(Ty->getTypeAtIndex(i));
}
LLVMBool LLVMIsPackedStruct(LLVMTypeRef StructTy) {
return unwrap<StructType>(StructTy)->isPacked();
}
LLVMBool LLVMIsOpaqueStruct(LLVMTypeRef StructTy) {
return unwrap<StructType>(StructTy)->isOpaque();
}
LLVMBool LLVMIsLiteralStruct(LLVMTypeRef StructTy) {
return unwrap<StructType>(StructTy)->isLiteral();
}
LLVMTypeRef LLVMGetTypeByName(LLVMModuleRef M, const char *Name) {
return wrap(unwrap(M)->getTypeByName(Name));
}
/*--.. Operations on array, pointer, and vector types (sequence types) .....--*/
void LLVMGetSubtypes(LLVMTypeRef Tp, LLVMTypeRef *Arr) {
int i = 0;
for (auto *T : unwrap(Tp)->subtypes()) {
Arr[i] = wrap(T);
i++;
}
}
LLVMTypeRef LLVMArrayType(LLVMTypeRef ElementType, unsigned ElementCount) {
return wrap(ArrayType::get(unwrap(ElementType), ElementCount));
}
LLVMTypeRef LLVMPointerType(LLVMTypeRef ElementType, unsigned AddressSpace) {
return wrap(PointerType::get(unwrap(ElementType), AddressSpace));
}
LLVMTypeRef LLVMVectorType(LLVMTypeRef ElementType, unsigned ElementCount) {
return wrap(VectorType::get(unwrap(ElementType), ElementCount));
}
LLVMTypeRef LLVMGetElementType(LLVMTypeRef WrappedTy) {
auto *Ty = unwrap<Type>(WrappedTy);
if (auto *PTy = dyn_cast<PointerType>(Ty))
return wrap(PTy->getElementType());
return wrap(cast<SequentialType>(Ty)->getElementType());
}
unsigned LLVMGetNumContainedTypes(LLVMTypeRef Tp) {
return unwrap(Tp)->getNumContainedTypes();
}
unsigned LLVMGetArrayLength(LLVMTypeRef ArrayTy) {
return unwrap<ArrayType>(ArrayTy)->getNumElements();
}
unsigned LLVMGetPointerAddressSpace(LLVMTypeRef PointerTy) {
return unwrap<PointerType>(PointerTy)->getAddressSpace();
}
unsigned LLVMGetVectorSize(LLVMTypeRef VectorTy) {
return unwrap<VectorType>(VectorTy)->getNumElements();
}
/*--.. Operations on other types ...........................................--*/
LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C) {
return wrap(Type::getVoidTy(*unwrap(C)));
}
LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C) {
return wrap(Type::getLabelTy(*unwrap(C)));
}
LLVMTypeRef LLVMTokenTypeInContext(LLVMContextRef C) {
return wrap(Type::getTokenTy(*unwrap(C)));
}
LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C) {
return wrap(Type::getMetadataTy(*unwrap(C)));
}
LLVMTypeRef LLVMVoidType(void) {
return LLVMVoidTypeInContext(LLVMGetGlobalContext());
}
LLVMTypeRef LLVMLabelType(void) {
return LLVMLabelTypeInContext(LLVMGetGlobalContext());
}
/*===-- Operations on values ----------------------------------------------===*/
/*--.. Operations on all values ............................................--*/
LLVMTypeRef LLVMTypeOf(LLVMValueRef Val) {
return wrap(unwrap(Val)->getType());
}
LLVMValueKind LLVMGetValueKind(LLVMValueRef Val) {
switch(unwrap(Val)->getValueID()) {
#define HANDLE_VALUE(Name) \
case Value::Name##Val: \
return LLVM##Name##ValueKind;
#include "llvm/IR/Value.def"
default:
return LLVMInstructionValueKind;
}
}
const char *LLVMGetValueName2(LLVMValueRef Val, size_t *Length) {
auto *V = unwrap(Val);
*Length = V->getName().size();
return V->getName().data();
}
void LLVMSetValueName2(LLVMValueRef Val, const char *Name, size_t NameLen) {
unwrap(Val)->setName(StringRef(Name, NameLen));
}
const char *LLVMGetValueName(LLVMValueRef Val) {
return unwrap(Val)->getName().data();
}
void LLVMSetValueName(LLVMValueRef Val, const char *Name) {
unwrap(Val)->setName(Name);
}
void LLVMDumpValue(LLVMValueRef Val) {
unwrap(Val)->print(errs(), /*IsForDebug=*/true);
}
char* LLVMPrintValueToString(LLVMValueRef Val) {
std::string buf;
raw_string_ostream os(buf);
if (unwrap(Val))
unwrap(Val)->print(os);
else
os << "Printing <null> Value";
os.flush();
return strdup(buf.c_str());
}
void LLVMReplaceAllUsesWith(LLVMValueRef OldVal, LLVMValueRef NewVal) {
unwrap(OldVal)->replaceAllUsesWith(unwrap(NewVal));
}
int LLVMHasMetadata(LLVMValueRef Inst) {
return unwrap<Instruction>(Inst)->hasMetadata();
}
LLVMValueRef LLVMGetMetadata(LLVMValueRef Inst, unsigned KindID) {
auto *I = unwrap<Instruction>(Inst);
assert(I && "Expected instruction");
if (auto *MD = I->getMetadata(KindID))
return wrap(MetadataAsValue::get(I->getContext(), MD));
return nullptr;
}
// MetadataAsValue uses a canonical format which strips the actual MDNode for
// an MDNode with just a single constant value, storing only a
// ConstantAsMetadata. This undoes that canonicalization, reconstructing the
// MDNode.
static MDNode *extractMDNode(MetadataAsValue *MAV) {
Metadata *MD = MAV->getMetadata();
assert((isa<MDNode>(MD) || isa<ConstantAsMetadata>(MD)) &&
"Expected a metadata node or a canonicalized constant");
if (MDNode *N = dyn_cast<MDNode>(MD))
return N;
return MDNode::get(MAV->getContext(), MD);
}
void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef Val) {
MDNode *N = Val ? extractMDNode(unwrap<MetadataAsValue>(Val)) : nullptr;
unwrap<Instruction>(Inst)->setMetadata(KindID, N);
}
struct LLVMOpaqueValueMetadataEntry {
unsigned Kind;
LLVMMetadataRef Metadata;
};
using MetadataEntries = SmallVectorImpl<std::pair<unsigned, MDNode *>>;
static LLVMValueMetadataEntry *
llvm_getMetadata(size_t *NumEntries,
llvm::function_ref<void(MetadataEntries &)> AccessMD) {
SmallVector<std::pair<unsigned, MDNode *>, 8> MVEs;
AccessMD(MVEs);
LLVMOpaqueValueMetadataEntry *Result =
static_cast<LLVMOpaqueValueMetadataEntry *>(
safe_malloc(MVEs.size() * sizeof(LLVMOpaqueValueMetadataEntry)));
for (unsigned i = 0; i < MVEs.size(); ++i) {
const auto &ModuleFlag = MVEs[i];
Result[i].Kind = ModuleFlag.first;
Result[i].Metadata = wrap(ModuleFlag.second);
}
*NumEntries = MVEs.size();
return Result;
}
LLVMValueMetadataEntry *
LLVMInstructionGetAllMetadataOtherThanDebugLoc(LLVMValueRef Value,
size_t *NumEntries) {
return llvm_getMetadata(NumEntries, [&Value](MetadataEntries &Entries) {
unwrap<Instruction>(Value)->getAllMetadata(Entries);
});
}
/*--.. Conversion functions ................................................--*/
#define LLVM_DEFINE_VALUE_CAST(name) \
LLVMValueRef LLVMIsA##name(LLVMValueRef Val) { \
return wrap(static_cast<Value*>(dyn_cast_or_null<name>(unwrap(Val)))); \
}
LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DEFINE_VALUE_CAST)
LLVMValueRef LLVMIsAMDNode(LLVMValueRef Val) {
if (auto *MD = dyn_cast_or_null<MetadataAsValue>(unwrap(Val)))
if (isa<MDNode>(MD->getMetadata()) ||
isa<ValueAsMetadata>(MD->getMetadata()))
return Val;
return nullptr;
}
LLVMValueRef LLVMIsAMDString(LLVMValueRef Val) {
if (auto *MD = dyn_cast_or_null<MetadataAsValue>(unwrap(Val)))
if (isa<MDString>(MD->getMetadata()))
return Val;
return nullptr;
}
/*--.. Operations on Uses ..................................................--*/
LLVMUseRef LLVMGetFirstUse(LLVMValueRef Val) {
Value *V = unwrap(Val);
Value::use_iterator I = V->use_begin();
if (I == V->use_end())
return nullptr;
return wrap(&*I);
}
LLVMUseRef LLVMGetNextUse(LLVMUseRef U) {
Use *Next = unwrap(U)->getNext();
if (Next)
return wrap(Next);
return nullptr;
}
LLVMValueRef LLVMGetUser(LLVMUseRef U) {
return wrap(unwrap(U)->getUser());
}
LLVMValueRef LLVMGetUsedValue(LLVMUseRef U) {
return wrap(unwrap(U)->get());
}
/*--.. Operations on Users .................................................--*/
static LLVMValueRef getMDNodeOperandImpl(LLVMContext &Context, const MDNode *N,
unsigned Index) {
Metadata *Op = N->getOperand(Index);
if (!Op)
return nullptr;
if (auto *C = dyn_cast<ConstantAsMetadata>(Op))
return wrap(C->getValue());
return wrap(MetadataAsValue::get(Context, Op));
}
LLVMValueRef LLVMGetOperand(LLVMValueRef Val, unsigned Index) {
Value *V = unwrap(Val);
if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
if (auto *L = dyn_cast<ValueAsMetadata>(MD->getMetadata())) {
assert(Index == 0 && "Function-local metadata can only have one operand");
return wrap(L->getValue());
}
return getMDNodeOperandImpl(V->getContext(),
cast<MDNode>(MD->getMetadata()), Index);
}
return wrap(cast<User>(V)->getOperand(Index));
}
LLVMUseRef LLVMGetOperandUse(LLVMValueRef Val, unsigned Index) {
Value *V = unwrap(Val);
return wrap(&cast<User>(V)->getOperandUse(Index));
}
void LLVMSetOperand(LLVMValueRef Val, unsigned Index, LLVMValueRef Op) {
unwrap<User>(Val)->setOperand(Index, unwrap(Op));
}
int LLVMGetNumOperands(LLVMValueRef Val) {
Value *V = unwrap(Val);
if (isa<MetadataAsValue>(V))
return LLVMGetMDNodeNumOperands(Val);
return cast<User>(V)->getNumOperands();
}
/*--.. Operations on constants of any type .................................--*/
LLVMValueRef LLVMConstNull(LLVMTypeRef Ty) {
return wrap(Constant::getNullValue(unwrap(Ty)));
}
LLVMValueRef LLVMConstAllOnes(LLVMTypeRef Ty) {
return wrap(Constant::getAllOnesValue(unwrap(Ty)));
}
LLVMValueRef LLVMGetUndef(LLVMTypeRef Ty) {
return wrap(UndefValue::get(unwrap(Ty)));
}
LLVMBool LLVMIsConstant(LLVMValueRef Ty) {
return isa<Constant>(unwrap(Ty));
}
LLVMBool LLVMIsNull(LLVMValueRef Val) {
if (Constant *C = dyn_cast<Constant>(unwrap(Val)))
return C->isNullValue();
return false;
}
LLVMBool LLVMIsUndef(LLVMValueRef Val) {
return isa<UndefValue>(unwrap(Val));
}
LLVMValueRef LLVMConstPointerNull(LLVMTypeRef Ty) {
return wrap(ConstantPointerNull::get(unwrap<PointerType>(Ty)));
}
/*--.. Operations on metadata nodes ........................................--*/
LLVMMetadataRef LLVMMDStringInContext2(LLVMContextRef C, const char *Str,
size_t SLen) {
return wrap(MDString::get(*unwrap(C), StringRef(Str, SLen)));
}
LLVMMetadataRef LLVMMDNodeInContext2(LLVMContextRef C, LLVMMetadataRef *MDs,
size_t Count) {
return wrap(MDNode::get(*unwrap(C), ArrayRef<Metadata*>(unwrap(MDs), Count)));
}
LLVMValueRef LLVMMDStringInContext(LLVMContextRef C, const char *Str,
unsigned SLen) {
LLVMContext &Context = *unwrap(C);
return wrap(MetadataAsValue::get(
Context, MDString::get(Context, StringRef(Str, SLen))));
}
LLVMValueRef LLVMMDString(const char *Str, unsigned SLen) {
return LLVMMDStringInContext(LLVMGetGlobalContext(), Str, SLen);
}
LLVMValueRef LLVMMDNodeInContext(LLVMContextRef C, LLVMValueRef *Vals,
unsigned Count) {
LLVMContext &Context = *unwrap(C);
SmallVector<Metadata *, 8> MDs;
for (auto *OV : makeArrayRef(Vals, Count)) {
Value *V = unwrap(OV);
Metadata *MD;
if (!V)
MD = nullptr;
else if (auto *C = dyn_cast<Constant>(V))
MD = ConstantAsMetadata::get(C);
else if (auto *MDV = dyn_cast<MetadataAsValue>(V)) {
MD = MDV->getMetadata();
assert(!isa<LocalAsMetadata>(MD) && "Unexpected function-local metadata "
"outside of direct argument to call");
} else {
// This is function-local metadata. Pretend to make an MDNode.
assert(Count == 1 &&
"Expected only one operand to function-local metadata");
return wrap(MetadataAsValue::get(Context, LocalAsMetadata::get(V)));
}
MDs.push_back(MD);
}
return wrap(MetadataAsValue::get(Context, MDNode::get(Context, MDs)));
}
LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) {
return LLVMMDNodeInContext(LLVMGetGlobalContext(), Vals, Count);
}
LLVMValueRef LLVMMetadataAsValue(LLVMContextRef C, LLVMMetadataRef MD) {
return wrap(MetadataAsValue::get(*unwrap(C), unwrap(MD)));
}
LLVMMetadataRef LLVMValueAsMetadata(LLVMValueRef Val) {
auto *V = unwrap(Val);
if (auto *C = dyn_cast<Constant>(V))
return wrap(ConstantAsMetadata::get(C));
if (auto *MAV = dyn_cast<MetadataAsValue>(V))
return wrap(MAV->getMetadata());
return wrap(ValueAsMetadata::get(V));
}
const char *LLVMGetMDString(LLVMValueRef V, unsigned *Length) {
if (const auto *MD = dyn_cast<MetadataAsValue>(unwrap(V)))
if (const MDString *S = dyn_cast<MDString>(MD->getMetadata())) {
*Length = S->getString().size();
return S->getString().data();
}
*Length = 0;
return nullptr;
}
unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V) {
auto *MD = cast<MetadataAsValue>(unwrap(V));
if (isa<ValueAsMetadata>(MD->getMetadata()))
return 1;
return cast<MDNode>(MD->getMetadata())->getNumOperands();
}
LLVMNamedMDNodeRef LLVMGetFirstNamedMetadata(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::named_metadata_iterator I = Mod->named_metadata_begin();
if (I == Mod->named_metadata_end())
return nullptr;
return wrap(&*I);
}
LLVMNamedMDNodeRef LLVMGetLastNamedMetadata(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::named_metadata_iterator I = Mod->named_metadata_end();
if (I == Mod->named_metadata_begin())
return nullptr;
return wrap(&*--I);
}
LLVMNamedMDNodeRef LLVMGetNextNamedMetadata(LLVMNamedMDNodeRef NMD) {
NamedMDNode *NamedNode = unwrap<NamedMDNode>(NMD);
Module::named_metadata_iterator I(NamedNode);
if (++I == NamedNode->getParent()->named_metadata_end())
return nullptr;
return wrap(&*I);
}
LLVMNamedMDNodeRef LLVMGetPreviousNamedMetadata(LLVMNamedMDNodeRef NMD) {
NamedMDNode *NamedNode = unwrap<NamedMDNode>(NMD);
Module::named_metadata_iterator I(NamedNode);
if (I == NamedNode->getParent()->named_metadata_begin())
return nullptr;
return wrap(&*--I);
}
LLVMNamedMDNodeRef LLVMGetNamedMetadata(LLVMModuleRef M,
const char *Name, size_t NameLen) {
return wrap(unwrap(M)->getNamedMetadata(StringRef(Name, NameLen)));
}
LLVMNamedMDNodeRef LLVMGetOrInsertNamedMetadata(LLVMModuleRef M,
const char *Name, size_t NameLen) {
return wrap(unwrap(M)->getOrInsertNamedMetadata({Name, NameLen}));
}
const char *LLVMGetNamedMetadataName(LLVMNamedMDNodeRef NMD, size_t *NameLen) {
NamedMDNode *NamedNode = unwrap<NamedMDNode>(NMD);
*NameLen = NamedNode->getName().size();
return NamedNode->getName().data();
}
void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest) {
auto *MD = cast<MetadataAsValue>(unwrap(V));
if (auto *MDV = dyn_cast<ValueAsMetadata>(MD->getMetadata())) {
*Dest = wrap(MDV->getValue());
return;
}
const auto *N = cast<MDNode>(MD->getMetadata());
const unsigned numOperands = N->getNumOperands();
LLVMContext &Context = unwrap(V)->getContext();
for (unsigned i = 0; i < numOperands; i++)
Dest[i] = getMDNodeOperandImpl(Context, N, i);
}
unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char *Name) {
if (NamedMDNode *N = unwrap(M)->getNamedMetadata(Name)) {
return N->getNumOperands();
}
return 0;
}
void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char *Name,
LLVMValueRef *Dest) {
NamedMDNode *N = unwrap(M)->getNamedMetadata(Name);
if (!N)
return;
LLVMContext &Context = unwrap(M)->getContext();
for (unsigned i=0;i<N->getNumOperands();i++)
Dest[i] = wrap(MetadataAsValue::get(Context, N->getOperand(i)));
}
void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name,
LLVMValueRef Val) {
NamedMDNode *N = unwrap(M)->getOrInsertNamedMetadata(Name);
if (!N)
return;
if (!Val)
return;
N->addOperand(extractMDNode(unwrap<MetadataAsValue>(Val)));
}
const char *LLVMGetDebugLocDirectory(LLVMValueRef Val, unsigned *Length) {
if (!Length) return nullptr;
StringRef S;
if (const auto *I = dyn_cast<Instruction>(unwrap(Val))) {
if (const auto &DL = I->getDebugLoc()) {
S = DL->getDirectory();
}
} else if (const auto *GV = dyn_cast<GlobalVariable>(unwrap(Val))) {
SmallVector<DIGlobalVariableExpression *, 1> GVEs;
GV->getDebugInfo(GVEs);
if (GVEs.size())
if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
S = DGV->getDirectory();
} else if (const auto *F = dyn_cast<Function>(unwrap(Val))) {
if (const DISubprogram *DSP = F->getSubprogram())
S = DSP->getDirectory();
} else {
assert(0 && "Expected Instruction, GlobalVariable or Function");
return nullptr;
}
*Length = S.size();
return S.data();
}
const char *LLVMGetDebugLocFilename(LLVMValueRef Val, unsigned *Length) {
if (!Length) return nullptr;
StringRef S;
if (const auto *I = dyn_cast<Instruction>(unwrap(Val))) {
if (const auto &DL = I->getDebugLoc()) {
S = DL->getFilename();
}
} else if (const auto *GV = dyn_cast<GlobalVariable>(unwrap(Val))) {
SmallVector<DIGlobalVariableExpression *, 1> GVEs;
GV->getDebugInfo(GVEs);
if (GVEs.size())
if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
S = DGV->getFilename();
} else if (const auto *F = dyn_cast<Function>(unwrap(Val))) {
if (const DISubprogram *DSP = F->getSubprogram())
S = DSP->getFilename();
} else {
assert(0 && "Expected Instruction, GlobalVariable or Function");
return nullptr;
}
*Length = S.size();
return S.data();
}
unsigned LLVMGetDebugLocLine(LLVMValueRef Val) {
unsigned L = 0;
if (const auto *I = dyn_cast<Instruction>(unwrap(Val))) {
if (const auto &DL = I->getDebugLoc()) {
L = DL->getLine();
}
} else if (const auto *GV = dyn_cast<GlobalVariable>(unwrap(Val))) {
SmallVector<DIGlobalVariableExpression *, 1> GVEs;
GV->getDebugInfo(GVEs);
if (GVEs.size())
if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
L = DGV->getLine();
} else if (const auto *F = dyn_cast<Function>(unwrap(Val))) {
if (const DISubprogram *DSP = F->getSubprogram())
L = DSP->getLine();
} else {
assert(0 && "Expected Instruction, GlobalVariable or Function");
return -1;
}
return L;
}
unsigned LLVMGetDebugLocColumn(LLVMValueRef Val) {
unsigned C = 0;
if (const auto *I = dyn_cast<Instruction>(unwrap(Val)))
if (const auto &DL = I->getDebugLoc())
C = DL->getColumn();
return C;
}
/*--.. Operations on scalar constants ......................................--*/
LLVMValueRef LLVMConstInt(LLVMTypeRef IntTy, unsigned long long N,
LLVMBool SignExtend) {
return wrap(ConstantInt::get(unwrap<IntegerType>(IntTy), N, SignExtend != 0));
}
LLVMValueRef LLVMConstIntOfArbitraryPrecision(LLVMTypeRef IntTy,
unsigned NumWords,
const uint64_t Words[]) {
IntegerType *Ty = unwrap<IntegerType>(IntTy);
return wrap(ConstantInt::get(Ty->getContext(),
APInt(Ty->getBitWidth(),
makeArrayRef(Words, NumWords))));
}
LLVMValueRef LLVMConstIntOfString(LLVMTypeRef IntTy, const char Str[],
uint8_t Radix) {
return wrap(ConstantInt::get(unwrap<IntegerType>(IntTy), StringRef(Str),
Radix));
}
LLVMValueRef LLVMConstIntOfStringAndSize(LLVMTypeRef IntTy, const char Str[],
unsigned SLen, uint8_t Radix) {
return wrap(ConstantInt::get(unwrap<IntegerType>(IntTy), StringRef(Str, SLen),
Radix));
}
LLVMValueRef LLVMConstReal(LLVMTypeRef RealTy, double N) {
return wrap(ConstantFP::get(unwrap(RealTy), N));
}
LLVMValueRef LLVMConstRealOfString(LLVMTypeRef RealTy, const char *Text) {
return wrap(ConstantFP::get(unwrap(RealTy), StringRef(Text)));
}
LLVMValueRef LLVMConstRealOfStringAndSize(LLVMTypeRef RealTy, const char Str[],
unsigned SLen) {
return wrap(ConstantFP::get(unwrap(RealTy), StringRef(Str, SLen)));
}
unsigned long long LLVMConstIntGetZExtValue(LLVMValueRef ConstantVal) {
return unwrap<ConstantInt>(ConstantVal)->getZExtValue();
}
long long LLVMConstIntGetSExtValue(LLVMValueRef ConstantVal) {
return unwrap<ConstantInt>(ConstantVal)->getSExtValue();
}
double LLVMConstRealGetDouble(LLVMValueRef ConstantVal, LLVMBool *LosesInfo) {
ConstantFP *cFP = unwrap<ConstantFP>(ConstantVal);
Type *Ty = cFP->getType();
if (Ty->isFloatTy()) {
*LosesInfo = false;
return cFP->getValueAPF().convertToFloat();
}
if (Ty->isDoubleTy()) {
*LosesInfo = false;
return cFP->getValueAPF().convertToDouble();
}
bool APFLosesInfo;
APFloat APF = cFP->getValueAPF();
APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &APFLosesInfo);
*LosesInfo = APFLosesInfo;
return APF.convertToDouble();
}
/*--.. Operations on composite constants ...................................--*/
LLVMValueRef LLVMConstStringInContext(LLVMContextRef C, const char *Str,
unsigned Length,
LLVMBool DontNullTerminate) {
/* Inverted the sense of AddNull because ', 0)' is a
better mnemonic for null termination than ', 1)'. */
return wrap(ConstantDataArray::getString(*unwrap(C), StringRef(Str, Length),
DontNullTerminate == 0));
}
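// Illustrative caller-side sketch (not part of the original implementation),
// assuming the caller already owns a context `Ctx`: because the flag is
// inverted, passing 0 requests the implicit terminator and a nonzero value
// suppresses it.
//
//   // 3 elements: 'h', 'i', '\0' (DontNullTerminate == 0 appends the NUL).
//   LLVMValueRef WithNul = LLVMConstStringInContext(Ctx, "hi", 2, 0);
//   // Exactly 2 elements: 'h', 'i' (nonzero DontNullTerminate).
//   LLVMValueRef NoNul = LLVMConstStringInContext(Ctx, "hi", 2, 1);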
LLVMValueRef LLVMConstString(const char *Str, unsigned Length,
LLVMBool DontNullTerminate) {
return LLVMConstStringInContext(LLVMGetGlobalContext(), Str, Length,
DontNullTerminate);
}
LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef C, unsigned idx) {
return wrap(unwrap<ConstantDataSequential>(C)->getElementAsConstant(idx));
}
LLVMBool LLVMIsConstantString(LLVMValueRef C) {
return unwrap<ConstantDataSequential>(C)->isString();
}
const char *LLVMGetAsString(LLVMValueRef C, size_t *Length) {
StringRef Str = unwrap<ConstantDataSequential>(C)->getAsString();
*Length = Str.size();
return Str.data();
}
LLVMValueRef LLVMConstArray(LLVMTypeRef ElementTy,
LLVMValueRef *ConstantVals, unsigned Length) {
ArrayRef<Constant*> V(unwrap<Constant>(ConstantVals, Length), Length);
return wrap(ConstantArray::get(ArrayType::get(unwrap(ElementTy), Length), V));
}
LLVMValueRef LLVMConstStructInContext(LLVMContextRef C,
LLVMValueRef *ConstantVals,
unsigned Count, LLVMBool Packed) {
Constant **Elements = unwrap<Constant>(ConstantVals, Count);
return wrap(ConstantStruct::getAnon(*unwrap(C), makeArrayRef(Elements, Count),
Packed != 0));
}
LLVMValueRef LLVMConstStruct(LLVMValueRef *ConstantVals, unsigned Count,
LLVMBool Packed) {
return LLVMConstStructInContext(LLVMGetGlobalContext(), ConstantVals, Count,
Packed);
}
LLVMValueRef LLVMConstNamedStruct(LLVMTypeRef StructTy,
LLVMValueRef *ConstantVals,
unsigned Count) {
Constant **Elements = unwrap<Constant>(ConstantVals, Count);
StructType *Ty = cast<StructType>(unwrap(StructTy));
return wrap(ConstantStruct::get(Ty, makeArrayRef(Elements, Count)));
}
LLVMValueRef LLVMConstVector(LLVMValueRef *ScalarConstantVals, unsigned Size) {
return wrap(ConstantVector::get(makeArrayRef(
unwrap<Constant>(ScalarConstantVals, Size), Size)));
}
/*-- Opcode mapping */
static LLVMOpcode map_to_llvmopcode(int opcode)
{
switch (opcode) {
default: llvm_unreachable("Unhandled Opcode.");
#define HANDLE_INST(num, opc, clas) case num: return LLVM##opc;
#include "llvm/IR/Instruction.def"
#undef HANDLE_INST
}
}
static int map_from_llvmopcode(LLVMOpcode code)
{
switch (code) {
#define HANDLE_INST(num, opc, clas) case LLVM##opc: return num;
#include "llvm/IR/Instruction.def"
#undef HANDLE_INST
}
llvm_unreachable("Unhandled Opcode.");
}
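// Illustrative note (not in the original source): HANDLE_INST is an X-macro
// supplied by llvm/IR/Instruction.def, so each include above expands to one
// case label per IR instruction. For the Ret terminator, for example, the
// generated cases look roughly like:
//
//   case 1: return LLVMRet;   // in map_to_llvmopcode
//   case LLVMRet: return 1;   // in map_from_llvmopcode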
/*--.. Constant expressions ................................................--*/
LLVMOpcode LLVMGetConstOpcode(LLVMValueRef ConstantVal) {
return map_to_llvmopcode(unwrap<ConstantExpr>(ConstantVal)->getOpcode());
}
LLVMValueRef LLVMAlignOf(LLVMTypeRef Ty) {
return wrap(ConstantExpr::getAlignOf(unwrap(Ty)));
}
LLVMValueRef LLVMSizeOf(LLVMTypeRef Ty) {
return wrap(ConstantExpr::getSizeOf(unwrap(Ty)));
}
LLVMValueRef LLVMConstNeg(LLVMValueRef ConstantVal) {
return wrap(ConstantExpr::getNeg(unwrap<Constant>(ConstantVal)));
}
LLVMValueRef LLVMConstNSWNeg(LLVMValueRef ConstantVal) {
return wrap(ConstantExpr::getNSWNeg(unwrap<Constant>(ConstantVal)));
}
LLVMValueRef LLVMConstNUWNeg(LLVMValueRef ConstantVal) {
return wrap(ConstantExpr::getNUWNeg(unwrap<Constant>(ConstantVal)));
}
LLVMValueRef LLVMConstFNeg(LLVMValueRef ConstantVal) {
return wrap(ConstantExpr::getFNeg(unwrap<Constant>(ConstantVal)));
}
LLVMValueRef LLVMConstNot(LLVMValueRef ConstantVal) {
return wrap(ConstantExpr::getNot(unwrap<Constant>(ConstantVal)));
}
LLVMValueRef LLVMConstAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getAdd(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstNSWAdd(LLVMValueRef LHSConstant,
LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getNSWAdd(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstNUWAdd(LLVMValueRef LHSConstant,
LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getNUWAdd(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstFAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getFAdd(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getSub(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstNSWSub(LLVMValueRef LHSConstant,
LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getNSWSub(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstNUWSub(LLVMValueRef LHSConstant,
LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getNUWSub(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstFSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getFSub(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getMul(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstNSWMul(LLVMValueRef LHSConstant,
LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getNSWMul(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstNUWMul(LLVMValueRef LHSConstant,
LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getNUWMul(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstFMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getFMul(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstUDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getUDiv(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstExactUDiv(LLVMValueRef LHSConstant,
LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getExactUDiv(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstSDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getSDiv(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstExactSDiv(LLVMValueRef LHSConstant,
LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getExactSDiv(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstFDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getFDiv(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstURem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getURem(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstSRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getSRem(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstFRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getFRem(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstAnd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getAnd(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstOr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getOr(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstXor(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getXor(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstICmp(LLVMIntPredicate Predicate,
LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getICmp(Predicate,
unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstFCmp(LLVMRealPredicate Predicate,
LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getFCmp(Predicate,
unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstShl(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getShl(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstLShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getLShr(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstAShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
return wrap(ConstantExpr::getAShr(unwrap<Constant>(LHSConstant),
unwrap<Constant>(RHSConstant)));
}
LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal,
LLVMValueRef *ConstantIndices, unsigned NumIndices) {
ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices),
NumIndices);
Constant *Val = unwrap<Constant>(ConstantVal);
Type *Ty =
cast<PointerType>(Val->getType()->getScalarType())->getElementType();
return wrap(ConstantExpr::getGetElementPtr(Ty, Val, IdxList));
}
LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal,
LLVMValueRef *ConstantIndices,
unsigned NumIndices) {
ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices),
NumIndices);
Constant *Val = unwrap<Constant>(ConstantVal);
Type *Ty =
cast<PointerType>(Val->getType()->getScalarType())->getElementType();
return wrap(ConstantExpr::getInBoundsGetElementPtr(Ty, Val, IdxList));
}
LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getTrunc(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstSExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getSExt(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstZExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getZExt(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstFPTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getFPTrunc(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstFPExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getFPExtend(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstUIToFP(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getUIToFP(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstSIToFP(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getSIToFP(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstFPToUI(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getFPToUI(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstFPToSI(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getFPToSI(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstPtrToInt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getPtrToInt(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstIntToPtr(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getIntToPtr(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getBitCast(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstAddrSpaceCast(LLVMValueRef ConstantVal,
LLVMTypeRef ToType) {
return wrap(ConstantExpr::getAddrSpaceCast(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstZExtOrBitCast(LLVMValueRef ConstantVal,
LLVMTypeRef ToType) {
return wrap(ConstantExpr::getZExtOrBitCast(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstSExtOrBitCast(LLVMValueRef ConstantVal,
LLVMTypeRef ToType) {
return wrap(ConstantExpr::getSExtOrBitCast(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstTruncOrBitCast(LLVMValueRef ConstantVal,
LLVMTypeRef ToType) {
return wrap(ConstantExpr::getTruncOrBitCast(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstPointerCast(LLVMValueRef ConstantVal,
LLVMTypeRef ToType) {
return wrap(ConstantExpr::getPointerCast(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstIntCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType,
LLVMBool isSigned) {
return wrap(ConstantExpr::getIntegerCast(unwrap<Constant>(ConstantVal),
unwrap(ToType), isSigned));
}
LLVMValueRef LLVMConstFPCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
return wrap(ConstantExpr::getFPCast(unwrap<Constant>(ConstantVal),
unwrap(ToType)));
}
LLVMValueRef LLVMConstSelect(LLVMValueRef ConstantCondition,
LLVMValueRef ConstantIfTrue,
LLVMValueRef ConstantIfFalse) {
return wrap(ConstantExpr::getSelect(unwrap<Constant>(ConstantCondition),
unwrap<Constant>(ConstantIfTrue),
unwrap<Constant>(ConstantIfFalse)));
}
LLVMValueRef LLVMConstExtractElement(LLVMValueRef VectorConstant,
LLVMValueRef IndexConstant) {
return wrap(ConstantExpr::getExtractElement(unwrap<Constant>(VectorConstant),
unwrap<Constant>(IndexConstant)));
}
LLVMValueRef LLVMConstInsertElement(LLVMValueRef VectorConstant,
LLVMValueRef ElementValueConstant,
LLVMValueRef IndexConstant) {
return wrap(ConstantExpr::getInsertElement(unwrap<Constant>(VectorConstant),
unwrap<Constant>(ElementValueConstant),
unwrap<Constant>(IndexConstant)));
}
LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant,
LLVMValueRef VectorBConstant,
LLVMValueRef MaskConstant) {
return wrap(ConstantExpr::getShuffleVector(unwrap<Constant>(VectorAConstant),
unwrap<Constant>(VectorBConstant),
unwrap<Constant>(MaskConstant)));
}
LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList,
unsigned NumIdx) {
return wrap(ConstantExpr::getExtractValue(unwrap<Constant>(AggConstant),
makeArrayRef(IdxList, NumIdx)));
}
LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant,
LLVMValueRef ElementValueConstant,
unsigned *IdxList, unsigned NumIdx) {
return wrap(ConstantExpr::getInsertValue(unwrap<Constant>(AggConstant),
unwrap<Constant>(ElementValueConstant),
makeArrayRef(IdxList, NumIdx)));
}
LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty, const char *AsmString,
const char *Constraints,
LLVMBool HasSideEffects,
LLVMBool IsAlignStack) {
return wrap(InlineAsm::get(dyn_cast<FunctionType>(unwrap(Ty)), AsmString,
Constraints, HasSideEffects, IsAlignStack));
}
LLVMValueRef LLVMBlockAddress(LLVMValueRef F, LLVMBasicBlockRef BB) {
return wrap(BlockAddress::get(unwrap<Function>(F), unwrap(BB)));
}
/*--.. Operations on global variables, functions, and aliases (globals) ....--*/
LLVMModuleRef LLVMGetGlobalParent(LLVMValueRef Global) {
return wrap(unwrap<GlobalValue>(Global)->getParent());
}
LLVMBool LLVMIsDeclaration(LLVMValueRef Global) {
return unwrap<GlobalValue>(Global)->isDeclaration();
}
LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
switch (unwrap<GlobalValue>(Global)->getLinkage()) {
case GlobalValue::ExternalLinkage:
return LLVMExternalLinkage;
case GlobalValue::AvailableExternallyLinkage:
return LLVMAvailableExternallyLinkage;
case GlobalValue::LinkOnceAnyLinkage:
return LLVMLinkOnceAnyLinkage;
case GlobalValue::LinkOnceODRLinkage:
return LLVMLinkOnceODRLinkage;
case GlobalValue::WeakAnyLinkage:
return LLVMWeakAnyLinkage;
case GlobalValue::WeakODRLinkage:
return LLVMWeakODRLinkage;
case GlobalValue::AppendingLinkage:
return LLVMAppendingLinkage;
case GlobalValue::InternalLinkage:
return LLVMInternalLinkage;
case GlobalValue::PrivateLinkage:
return LLVMPrivateLinkage;
case GlobalValue::ExternalWeakLinkage:
return LLVMExternalWeakLinkage;
case GlobalValue::CommonLinkage:
return LLVMCommonLinkage;
}
llvm_unreachable("Invalid GlobalValue linkage!");
}
void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
GlobalValue *GV = unwrap<GlobalValue>(Global);
switch (Linkage) {
case LLVMExternalLinkage:
GV->setLinkage(GlobalValue::ExternalLinkage);
break;
case LLVMAvailableExternallyLinkage:
GV->setLinkage(GlobalValue::AvailableExternallyLinkage);
break;
case LLVMLinkOnceAnyLinkage:
GV->setLinkage(GlobalValue::LinkOnceAnyLinkage);
break;
case LLVMLinkOnceODRLinkage:
GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
break;
case LLVMLinkOnceODRAutoHideLinkage:
LLVM_DEBUG(
errs() << "LLVMSetLinkage(): LLVMLinkOnceODRAutoHideLinkage is no "
"longer supported.");
break;
case LLVMWeakAnyLinkage:
GV->setLinkage(GlobalValue::WeakAnyLinkage);
break;
case LLVMWeakODRLinkage:
GV->setLinkage(GlobalValue::WeakODRLinkage);
break;
case LLVMAppendingLinkage:
GV->setLinkage(GlobalValue::AppendingLinkage);
break;
case LLVMInternalLinkage:
GV->setLinkage(GlobalValue::InternalLinkage);
break;
case LLVMPrivateLinkage:
GV->setLinkage(GlobalValue::PrivateLinkage);
break;
case LLVMLinkerPrivateLinkage:
GV->setLinkage(GlobalValue::PrivateLinkage);
break;
case LLVMLinkerPrivateWeakLinkage:
GV->setLinkage(GlobalValue::PrivateLinkage);
break;
case LLVMDLLImportLinkage:
LLVM_DEBUG(
errs()
<< "LLVMSetLinkage(): LLVMDLLImportLinkage is no longer supported.");
break;
case LLVMDLLExportLinkage:
LLVM_DEBUG(
errs()
<< "LLVMSetLinkage(): LLVMDLLExportLinkage is no longer supported.");
break;
case LLVMExternalWeakLinkage:
GV->setLinkage(GlobalValue::ExternalWeakLinkage);
break;
case LLVMGhostLinkage:
LLVM_DEBUG(
errs() << "LLVMSetLinkage(): LLVMGhostLinkage is no longer supported.");
break;
case LLVMCommonLinkage:
GV->setLinkage(GlobalValue::CommonLinkage);
break;
}
}
const char *LLVMGetSection(LLVMValueRef Global) {
// Using .data() is safe because of how GlobalObject::setSection is
// implemented.
return unwrap<GlobalValue>(Global)->getSection().data();
}
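// Illustrative caller-side sketch (not part of the original file), assuming a
// global `G` obtained through this C API: the pointer returned by
// LLVMGetSection is a NUL-terminated string owned by LLVM, so it is borrowed
// rather than freed, and LLVMSetSection keeps its own copy of its argument.
//
//   LLVMSetSection(G, ".mydata");          // the string is copied internally
//   const char *Sec = LLVMGetSection(G);   // borrowed; do not free()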
void LLVMSetSection(LLVMValueRef Global, const char *Section) {
unwrap<GlobalObject>(Global)->setSection(Section);
}
LLVMVisibility LLVMGetVisibility(LLVMValueRef Global) {
return static_cast<LLVMVisibility>(
unwrap<GlobalValue>(Global)->getVisibility());
}
void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz) {
unwrap<GlobalValue>(Global)
->setVisibility(static_cast<GlobalValue::VisibilityTypes>(Viz));
}
LLVMDLLStorageClass LLVMGetDLLStorageClass(LLVMValueRef Global) {
return static_cast<LLVMDLLStorageClass>(
unwrap<GlobalValue>(Global)->getDLLStorageClass());
}
void LLVMSetDLLStorageClass(LLVMValueRef Global, LLVMDLLStorageClass Class) {
unwrap<GlobalValue>(Global)->setDLLStorageClass(
static_cast<GlobalValue::DLLStorageClassTypes>(Class));
}
LLVMUnnamedAddr LLVMGetUnnamedAddress(LLVMValueRef Global) {
switch (unwrap<GlobalValue>(Global)->getUnnamedAddr()) {
case GlobalVariable::UnnamedAddr::None:
return LLVMNoUnnamedAddr;
case GlobalVariable::UnnamedAddr::Local:
return LLVMLocalUnnamedAddr;
case GlobalVariable::UnnamedAddr::Global:
return LLVMGlobalUnnamedAddr;
}
llvm_unreachable("Unknown UnnamedAddr kind!");
}
void LLVMSetUnnamedAddress(LLVMValueRef Global, LLVMUnnamedAddr UnnamedAddr) {
GlobalValue *GV = unwrap<GlobalValue>(Global);
switch (UnnamedAddr) {
case LLVMNoUnnamedAddr:
return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::None);
case LLVMLocalUnnamedAddr:
return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local);
case LLVMGlobalUnnamedAddr:
return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::Global);
}
}
LLVMBool LLVMHasUnnamedAddr(LLVMValueRef Global) {
return unwrap<GlobalValue>(Global)->hasGlobalUnnamedAddr();
}
void LLVMSetUnnamedAddr(LLVMValueRef Global, LLVMBool HasUnnamedAddr) {
unwrap<GlobalValue>(Global)->setUnnamedAddr(
HasUnnamedAddr ? GlobalValue::UnnamedAddr::Global
: GlobalValue::UnnamedAddr::None);
}
LLVMTypeRef LLVMGlobalGetValueType(LLVMValueRef Global) {
return wrap(unwrap<GlobalValue>(Global)->getValueType());
}
/*--.. Operations on global variables, load and store instructions .........--*/
unsigned LLVMGetAlignment(LLVMValueRef V) {
Value *P = unwrap<Value>(V);
if (GlobalValue *GV = dyn_cast<GlobalValue>(P))
return GV->getAlignment();
if (AllocaInst *AI = dyn_cast<AllocaInst>(P))
return AI->getAlignment();
if (LoadInst *LI = dyn_cast<LoadInst>(P))
return LI->getAlignment();
if (StoreInst *SI = dyn_cast<StoreInst>(P))
return SI->getAlignment();
llvm_unreachable(
"only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment");
}
void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) {
Value *P = unwrap<Value>(V);
if (GlobalObject *GV = dyn_cast<GlobalObject>(P))
GV->setAlignment(Bytes);
else if (AllocaInst *AI = dyn_cast<AllocaInst>(P))
AI->setAlignment(Bytes);
else if (LoadInst *LI = dyn_cast<LoadInst>(P))
LI->setAlignment(Bytes);
else if (StoreInst *SI = dyn_cast<StoreInst>(P))
SI->setAlignment(Bytes);
else
llvm_unreachable(
"only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment");
}
LLVMValueMetadataEntry *LLVMGlobalCopyAllMetadata(LLVMValueRef Value,
size_t *NumEntries) {
return llvm_getMetadata(NumEntries, [&Value](MetadataEntries &Entries) {
if (Instruction *Instr = dyn_cast<Instruction>(unwrap(Value))) {
Instr->getAllMetadata(Entries);
} else {
unwrap<GlobalObject>(Value)->getAllMetadata(Entries);
}
});
}
unsigned LLVMValueMetadataEntriesGetKind(LLVMValueMetadataEntry *Entries,
unsigned Index) {
LLVMOpaqueValueMetadataEntry MVE =
static_cast<LLVMOpaqueValueMetadataEntry>(Entries[Index]);
return MVE.Kind;
}
LLVMMetadataRef
LLVMValueMetadataEntriesGetMetadata(LLVMValueMetadataEntry *Entries,
unsigned Index) {
LLVMOpaqueValueMetadataEntry MVE =
static_cast<LLVMOpaqueValueMetadataEntry>(Entries[Index]);
return MVE.Metadata;
}
void LLVMDisposeValueMetadataEntries(LLVMValueMetadataEntry *Entries) {
free(Entries);
}
void LLVMGlobalSetMetadata(LLVMValueRef Global, unsigned Kind,
LLVMMetadataRef MD) {
unwrap<GlobalObject>(Global)->setMetadata(Kind, unwrap<MDNode>(MD));
}
void LLVMGlobalEraseMetadata(LLVMValueRef Global, unsigned Kind) {
unwrap<GlobalObject>(Global)->eraseMetadata(Kind);
}
void LLVMGlobalClearMetadata(LLVMValueRef Global) {
unwrap<GlobalObject>(Global)->clearMetadata();
}
/*--.. Operations on global variables ......................................--*/
LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) {
return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false,
GlobalValue::ExternalLinkage, nullptr, Name));
}
LLVMValueRef LLVMAddGlobalInAddressSpace(LLVMModuleRef M, LLVMTypeRef Ty,
const char *Name,
unsigned AddressSpace) {
return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false,
GlobalValue::ExternalLinkage, nullptr, Name,
nullptr, GlobalVariable::NotThreadLocal,
AddressSpace));
}
LLVMValueRef LLVMGetNamedGlobal(LLVMModuleRef M, const char *Name) {
return wrap(unwrap(M)->getNamedGlobal(Name));
}
LLVMValueRef LLVMGetFirstGlobal(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::global_iterator I = Mod->global_begin();
if (I == Mod->global_end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::global_iterator I = Mod->global_end();
if (I == Mod->global_begin())
return nullptr;
return wrap(&*--I);
}
LLVMValueRef LLVMGetNextGlobal(LLVMValueRef GlobalVar) {
GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar);
Module::global_iterator I(GV);
if (++I == GV->getParent()->global_end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetPreviousGlobal(LLVMValueRef GlobalVar) {
GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar);
Module::global_iterator I(GV);
if (I == GV->getParent()->global_begin())
return nullptr;
return wrap(&*--I);
}
void LLVMDeleteGlobal(LLVMValueRef GlobalVar) {
unwrap<GlobalVariable>(GlobalVar)->eraseFromParent();
}
LLVMValueRef LLVMGetInitializer(LLVMValueRef GlobalVar) {
GlobalVariable* GV = unwrap<GlobalVariable>(GlobalVar);
if (!GV->hasInitializer())
return nullptr;
return wrap(GV->getInitializer());
}
void LLVMSetInitializer(LLVMValueRef GlobalVar, LLVMValueRef ConstantVal) {
unwrap<GlobalVariable>(GlobalVar)
->setInitializer(unwrap<Constant>(ConstantVal));
}
LLVMBool LLVMIsThreadLocal(LLVMValueRef GlobalVar) {
return unwrap<GlobalVariable>(GlobalVar)->isThreadLocal();
}
void LLVMSetThreadLocal(LLVMValueRef GlobalVar, LLVMBool IsThreadLocal) {
unwrap<GlobalVariable>(GlobalVar)->setThreadLocal(IsThreadLocal != 0);
}
LLVMBool LLVMIsGlobalConstant(LLVMValueRef GlobalVar) {
return unwrap<GlobalVariable>(GlobalVar)->isConstant();
}
void LLVMSetGlobalConstant(LLVMValueRef GlobalVar, LLVMBool IsConstant) {
unwrap<GlobalVariable>(GlobalVar)->setConstant(IsConstant != 0);
}
LLVMThreadLocalMode LLVMGetThreadLocalMode(LLVMValueRef GlobalVar) {
switch (unwrap<GlobalVariable>(GlobalVar)->getThreadLocalMode()) {
case GlobalVariable::NotThreadLocal:
return LLVMNotThreadLocal;
case GlobalVariable::GeneralDynamicTLSModel:
return LLVMGeneralDynamicTLSModel;
case GlobalVariable::LocalDynamicTLSModel:
return LLVMLocalDynamicTLSModel;
case GlobalVariable::InitialExecTLSModel:
return LLVMInitialExecTLSModel;
case GlobalVariable::LocalExecTLSModel:
return LLVMLocalExecTLSModel;
}
llvm_unreachable("Invalid GlobalVariable thread local mode");
}
void LLVMSetThreadLocalMode(LLVMValueRef GlobalVar, LLVMThreadLocalMode Mode) {
GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar);
switch (Mode) {
case LLVMNotThreadLocal:
GV->setThreadLocalMode(GlobalVariable::NotThreadLocal);
break;
case LLVMGeneralDynamicTLSModel:
GV->setThreadLocalMode(GlobalVariable::GeneralDynamicTLSModel);
break;
case LLVMLocalDynamicTLSModel:
GV->setThreadLocalMode(GlobalVariable::LocalDynamicTLSModel);
break;
case LLVMInitialExecTLSModel:
GV->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
break;
case LLVMLocalExecTLSModel:
GV->setThreadLocalMode(GlobalVariable::LocalExecTLSModel);
break;
}
}
LLVMBool LLVMIsExternallyInitialized(LLVMValueRef GlobalVar) {
return unwrap<GlobalVariable>(GlobalVar)->isExternallyInitialized();
}
void LLVMSetExternallyInitialized(LLVMValueRef GlobalVar, LLVMBool IsExtInit) {
unwrap<GlobalVariable>(GlobalVar)->setExternallyInitialized(IsExtInit);
}
/*--.. Operations on aliases ......................................--*/
LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee,
const char *Name) {
auto *PTy = cast<PointerType>(unwrap(Ty));
return wrap(GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
GlobalValue::ExternalLinkage, Name,
unwrap<Constant>(Aliasee), unwrap(M)));
}
LLVMValueRef LLVMGetNamedGlobalAlias(LLVMModuleRef M,
const char *Name, size_t NameLen) {
return wrap(unwrap(M)->getNamedAlias(Name));
}
LLVMValueRef LLVMGetFirstGlobalAlias(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::alias_iterator I = Mod->alias_begin();
if (I == Mod->alias_end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetLastGlobalAlias(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::alias_iterator I = Mod->alias_end();
if (I == Mod->alias_begin())
return nullptr;
return wrap(&*--I);
}
LLVMValueRef LLVMGetNextGlobalAlias(LLVMValueRef GA) {
GlobalAlias *Alias = unwrap<GlobalAlias>(GA);
Module::alias_iterator I(Alias);
if (++I == Alias->getParent()->alias_end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetPreviousGlobalAlias(LLVMValueRef GA) {
GlobalAlias *Alias = unwrap<GlobalAlias>(GA);
Module::alias_iterator I(Alias);
if (I == Alias->getParent()->alias_begin())
return nullptr;
return wrap(&*--I);
}
LLVMValueRef LLVMAliasGetAliasee(LLVMValueRef Alias) {
return wrap(unwrap<GlobalAlias>(Alias)->getAliasee());
}
void LLVMAliasSetAliasee(LLVMValueRef Alias, LLVMValueRef Aliasee) {
unwrap<GlobalAlias>(Alias)->setAliasee(unwrap<Constant>(Aliasee));
}
/*--.. Operations on functions .............................................--*/
LLVMValueRef LLVMAddFunction(LLVMModuleRef M, const char *Name,
LLVMTypeRef FunctionTy) {
return wrap(Function::Create(unwrap<FunctionType>(FunctionTy),
GlobalValue::ExternalLinkage, Name, unwrap(M)));
}
LLVMValueRef LLVMGetNamedFunction(LLVMModuleRef M, const char *Name) {
return wrap(unwrap(M)->getFunction(Name));
}
LLVMValueRef LLVMGetFirstFunction(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::iterator I = Mod->begin();
if (I == Mod->end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::iterator I = Mod->end();
if (I == Mod->begin())
return nullptr;
return wrap(&*--I);
}
LLVMValueRef LLVMGetNextFunction(LLVMValueRef Fn) {
Function *Func = unwrap<Function>(Fn);
Module::iterator I(Func);
if (++I == Func->getParent()->end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetPreviousFunction(LLVMValueRef Fn) {
Function *Func = unwrap<Function>(Fn);
Module::iterator I(Func);
if (I == Func->getParent()->begin())
return nullptr;
return wrap(&*--I);
}
void LLVMDeleteFunction(LLVMValueRef Fn) {
unwrap<Function>(Fn)->eraseFromParent();
}
LLVMBool LLVMHasPersonalityFn(LLVMValueRef Fn) {
return unwrap<Function>(Fn)->hasPersonalityFn();
}
LLVMValueRef LLVMGetPersonalityFn(LLVMValueRef Fn) {
return wrap(unwrap<Function>(Fn)->getPersonalityFn());
}
void LLVMSetPersonalityFn(LLVMValueRef Fn, LLVMValueRef PersonalityFn) {
unwrap<Function>(Fn)->setPersonalityFn(unwrap<Constant>(PersonalityFn));
}
unsigned LLVMGetIntrinsicID(LLVMValueRef Fn) {
if (Function *F = dyn_cast<Function>(unwrap(Fn)))
return F->getIntrinsicID();
return 0;
}
static Intrinsic::ID llvm_map_to_intrinsic_id(unsigned ID) {
assert(ID < llvm::Intrinsic::num_intrinsics && "Intrinsic ID out of range");
return llvm::Intrinsic::ID(ID);
}
LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod,
unsigned ID,
LLVMTypeRef *ParamTypes,
size_t ParamCount) {
ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
auto IID = llvm_map_to_intrinsic_id(ID);
return wrap(llvm::Intrinsic::getDeclaration(unwrap(Mod), IID, Tys));
}
const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength) {
auto IID = llvm_map_to_intrinsic_id(ID);
auto Str = llvm::Intrinsic::getName(IID);
*NameLength = Str.size();
return Str.data();
}
LLVMTypeRef LLVMIntrinsicGetType(LLVMContextRef Ctx, unsigned ID,
LLVMTypeRef *ParamTypes, size_t ParamCount) {
auto IID = llvm_map_to_intrinsic_id(ID);
ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
return wrap(llvm::Intrinsic::getType(*unwrap(Ctx), IID, Tys));
}
const char *LLVMIntrinsicCopyOverloadedName(unsigned ID,
LLVMTypeRef *ParamTypes,
size_t ParamCount,
size_t *NameLength) {
auto IID = llvm_map_to_intrinsic_id(ID);
ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
auto Str = llvm::Intrinsic::getName(IID, Tys);
*NameLength = Str.length();
return strdup(Str.c_str());
}
unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen) {
return Function::lookupIntrinsicID({Name, NameLen});
}
LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID) {
auto IID = llvm_map_to_intrinsic_id(ID);
return llvm::Intrinsic::isOverloaded(IID);
}
unsigned LLVMGetFunctionCallConv(LLVMValueRef Fn) {
return unwrap<Function>(Fn)->getCallingConv();
}
void LLVMSetFunctionCallConv(LLVMValueRef Fn, unsigned CC) {
return unwrap<Function>(Fn)->setCallingConv(
static_cast<CallingConv::ID>(CC));
}
const char *LLVMGetGC(LLVMValueRef Fn) {
Function *F = unwrap<Function>(Fn);
return F->hasGC() ? F->getGC().c_str() : nullptr;
}
void LLVMSetGC(LLVMValueRef Fn, const char *GC) {
Function *F = unwrap<Function>(Fn);
if (GC)
F->setGC(GC);
else
F->clearGC();
}
void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
LLVMAttributeRef A) {
unwrap<Function>(F)->addAttribute(Idx, unwrap(A));
}
unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) {
auto AS = unwrap<Function>(F)->getAttributes().getAttributes(Idx);
return AS.getNumAttributes();
}
void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
LLVMAttributeRef *Attrs) {
auto AS = unwrap<Function>(F)->getAttributes().getAttributes(Idx);
for (auto A : AS)
*Attrs++ = wrap(A);
}
LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F,
LLVMAttributeIndex Idx,
unsigned KindID) {
return wrap(unwrap<Function>(F)->getAttribute(Idx,
(Attribute::AttrKind)KindID));
}
LLVMAttributeRef LLVMGetStringAttributeAtIndex(LLVMValueRef F,
LLVMAttributeIndex Idx,
const char *K, unsigned KLen) {
return wrap(unwrap<Function>(F)->getAttribute(Idx, StringRef(K, KLen)));
}
void LLVMRemoveEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
unsigned KindID) {
unwrap<Function>(F)->removeAttribute(Idx, (Attribute::AttrKind)KindID);
}
void LLVMRemoveStringAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
const char *K, unsigned KLen) {
unwrap<Function>(F)->removeAttribute(Idx, StringRef(K, KLen));
}
void LLVMAddTargetDependentFunctionAttr(LLVMValueRef Fn, const char *A,
const char *V) {
Function *Func = unwrap<Function>(Fn);
Attribute Attr = Attribute::get(Func->getContext(), A, V);
Func->addAttribute(AttributeList::FunctionIndex, Attr);
}
/*--.. Operations on parameters ............................................--*/
unsigned LLVMCountParams(LLVMValueRef FnRef) {
// This function is strictly redundant to
// LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(FnRef)))
return unwrap<Function>(FnRef)->arg_size();
}
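// Illustrative restatement of the comment above (not in the original source),
// assuming a function value `Fn` created through this C API:
//
//   unsigned N1 = LLVMCountParams(Fn);
//   unsigned N2 = LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(Fn)));
//   // N1 == N2; the second form goes through the pointee function type.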
void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) {
Function *Fn = unwrap<Function>(FnRef);
for (Function::arg_iterator I = Fn->arg_begin(),
E = Fn->arg_end(); I != E; I++)
*ParamRefs++ = wrap(&*I);
}
LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) {
Function *Fn = unwrap<Function>(FnRef);
return wrap(&Fn->arg_begin()[index]);
}
LLVMValueRef LLVMGetParamParent(LLVMValueRef V) {
return wrap(unwrap<Argument>(V)->getParent());
}
LLVMValueRef LLVMGetFirstParam(LLVMValueRef Fn) {
Function *Func = unwrap<Function>(Fn);
Function::arg_iterator I = Func->arg_begin();
if (I == Func->arg_end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) {
Function *Func = unwrap<Function>(Fn);
Function::arg_iterator I = Func->arg_end();
if (I == Func->arg_begin())
return nullptr;
return wrap(&*--I);
}
LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) {
Argument *A = unwrap<Argument>(Arg);
Function *Fn = A->getParent();
if (A->getArgNo() + 1 >= Fn->arg_size())
return nullptr;
return wrap(&Fn->arg_begin()[A->getArgNo() + 1]);
}
LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) {
Argument *A = unwrap<Argument>(Arg);
if (A->getArgNo() == 0)
return nullptr;
return wrap(&A->getParent()->arg_begin()[A->getArgNo() - 1]);
}
void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
Argument *A = unwrap<Argument>(Arg);
A->addAttr(Attribute::getWithAlignment(A->getContext(), align));
}
/*--.. Operations on ifuncs ................................................--*/
LLVMValueRef LLVMAddGlobalIFunc(LLVMModuleRef M,
const char *Name, size_t NameLen,
LLVMTypeRef Ty, unsigned AddrSpace,
LLVMValueRef Resolver) {
return wrap(GlobalIFunc::create(unwrap(Ty), AddrSpace,
GlobalValue::ExternalLinkage,
StringRef(Name, NameLen),
unwrap<Constant>(Resolver), unwrap(M)));
}
LLVMValueRef LLVMGetNamedGlobalIFunc(LLVMModuleRef M,
const char *Name, size_t NameLen) {
return wrap(unwrap(M)->getNamedIFunc(StringRef(Name, NameLen)));
}
LLVMValueRef LLVMGetFirstGlobalIFunc(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::ifunc_iterator I = Mod->ifunc_begin();
if (I == Mod->ifunc_end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetLastGlobalIFunc(LLVMModuleRef M) {
Module *Mod = unwrap(M);
Module::ifunc_iterator I = Mod->ifunc_end();
if (I == Mod->ifunc_begin())
return nullptr;
return wrap(&*--I);
}
LLVMValueRef LLVMGetNextGlobalIFunc(LLVMValueRef IFunc) {
GlobalIFunc *GIF = unwrap<GlobalIFunc>(IFunc);
Module::ifunc_iterator I(GIF);
if (++I == GIF->getParent()->ifunc_end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetPreviousGlobalIFunc(LLVMValueRef IFunc) {
GlobalIFunc *GIF = unwrap<GlobalIFunc>(IFunc);
Module::ifunc_iterator I(GIF);
if (I == GIF->getParent()->ifunc_begin())
return nullptr;
return wrap(&*--I);
}
LLVMValueRef LLVMGetGlobalIFuncResolver(LLVMValueRef IFunc) {
return wrap(unwrap<GlobalIFunc>(IFunc)->getResolver());
}
void LLVMSetGlobalIFuncResolver(LLVMValueRef IFunc, LLVMValueRef Resolver) {
unwrap<GlobalIFunc>(IFunc)->setResolver(unwrap<Constant>(Resolver));
}
void LLVMEraseGlobalIFunc(LLVMValueRef IFunc) {
unwrap<GlobalIFunc>(IFunc)->eraseFromParent();
}
void LLVMRemoveGlobalIFunc(LLVMValueRef IFunc) {
unwrap<GlobalIFunc>(IFunc)->removeFromParent();
}
/*--.. Operations on basic blocks ..........................................--*/
LLVMValueRef LLVMBasicBlockAsValue(LLVMBasicBlockRef BB) {
return wrap(static_cast<Value*>(unwrap(BB)));
}
LLVMBool LLVMValueIsBasicBlock(LLVMValueRef Val) {
return isa<BasicBlock>(unwrap(Val));
}
LLVMBasicBlockRef LLVMValueAsBasicBlock(LLVMValueRef Val) {
return wrap(unwrap<BasicBlock>(Val));
}
const char *LLVMGetBasicBlockName(LLVMBasicBlockRef BB) {
return unwrap(BB)->getName().data();
}
LLVMValueRef LLVMGetBasicBlockParent(LLVMBasicBlockRef BB) {
return wrap(unwrap(BB)->getParent());
}
LLVMValueRef LLVMGetBasicBlockTerminator(LLVMBasicBlockRef BB) {
return wrap(unwrap(BB)->getTerminator());
}
unsigned LLVMCountBasicBlocks(LLVMValueRef FnRef) {
return unwrap<Function>(FnRef)->size();
}
void LLVMGetBasicBlocks(LLVMValueRef FnRef, LLVMBasicBlockRef *BasicBlocksRefs){
Function *Fn = unwrap<Function>(FnRef);
for (BasicBlock &BB : *Fn)
*BasicBlocksRefs++ = wrap(&BB);
}
LLVMBasicBlockRef LLVMGetEntryBasicBlock(LLVMValueRef Fn) {
return wrap(&unwrap<Function>(Fn)->getEntryBlock());
}
LLVMBasicBlockRef LLVMGetFirstBasicBlock(LLVMValueRef Fn) {
Function *Func = unwrap<Function>(Fn);
Function::iterator I = Func->begin();
if (I == Func->end())
return nullptr;
return wrap(&*I);
}
LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) {
Function *Func = unwrap<Function>(Fn);
Function::iterator I = Func->end();
if (I == Func->begin())
return nullptr;
return wrap(&*--I);
}
LLVMBasicBlockRef LLVMGetNextBasicBlock(LLVMBasicBlockRef BB) {
BasicBlock *Block = unwrap(BB);
Function::iterator I(Block);
if (++I == Block->getParent()->end())
return nullptr;
return wrap(&*I);
}
LLVMBasicBlockRef LLVMGetPreviousBasicBlock(LLVMBasicBlockRef BB) {
BasicBlock *Block = unwrap(BB);
Function::iterator I(Block);
if (I == Block->getParent()->begin())
return nullptr;
return wrap(&*--I);
}
LLVMBasicBlockRef LLVMCreateBasicBlockInContext(LLVMContextRef C,
const char *Name) {
return wrap(llvm::BasicBlock::Create(*unwrap(C), Name));
}
void LLVMInsertExistingBasicBlockAfterInsertBlock(LLVMBuilderRef Builder,
LLVMBasicBlockRef BB) {
BasicBlock *ToInsert = unwrap(BB);
BasicBlock *CurBB = unwrap(Builder)->GetInsertBlock();
assert(CurBB && "current insertion point is invalid!");
CurBB->getParent()->getBasicBlockList().insertAfter(CurBB->getIterator(),
ToInsert);
}
void LLVMAppendExistingBasicBlock(LLVMValueRef Fn,
LLVMBasicBlockRef BB) {
unwrap<Function>(Fn)->getBasicBlockList().push_back(unwrap(BB));
}
LLVMBasicBlockRef LLVMAppendBasicBlockInContext(LLVMContextRef C,
LLVMValueRef FnRef,
const char *Name) {
return wrap(BasicBlock::Create(*unwrap(C), Name, unwrap<Function>(FnRef)));
}
LLVMBasicBlockRef LLVMAppendBasicBlock(LLVMValueRef FnRef, const char *Name) {
return LLVMAppendBasicBlockInContext(LLVMGetGlobalContext(), FnRef, Name);
}
LLVMBasicBlockRef LLVMInsertBasicBlockInContext(LLVMContextRef C,
LLVMBasicBlockRef BBRef,
const char *Name) {
BasicBlock *BB = unwrap(BBRef);
return wrap(BasicBlock::Create(*unwrap(C), Name, BB->getParent(), BB));
}
LLVMBasicBlockRef LLVMInsertBasicBlock(LLVMBasicBlockRef BBRef,
const char *Name) {
return LLVMInsertBasicBlockInContext(LLVMGetGlobalContext(), BBRef, Name);
}
void LLVMDeleteBasicBlock(LLVMBasicBlockRef BBRef) {
unwrap(BBRef)->eraseFromParent();
}
void LLVMRemoveBasicBlockFromParent(LLVMBasicBlockRef BBRef) {
unwrap(BBRef)->removeFromParent();
}
void LLVMMoveBasicBlockBefore(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) {
unwrap(BB)->moveBefore(unwrap(MovePos));
}
void LLVMMoveBasicBlockAfter(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) {
unwrap(BB)->moveAfter(unwrap(MovePos));
}
/*--.. Operations on instructions ..........................................--*/
LLVMBasicBlockRef LLVMGetInstructionParent(LLVMValueRef Inst) {
return wrap(unwrap<Instruction>(Inst)->getParent());
}
LLVMValueRef LLVMGetFirstInstruction(LLVMBasicBlockRef BB) {
BasicBlock *Block = unwrap(BB);
BasicBlock::iterator I = Block->begin();
if (I == Block->end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) {
BasicBlock *Block = unwrap(BB);
BasicBlock::iterator I = Block->end();
if (I == Block->begin())
return nullptr;
return wrap(&*--I);
}
LLVMValueRef LLVMGetNextInstruction(LLVMValueRef Inst) {
Instruction *Instr = unwrap<Instruction>(Inst);
BasicBlock::iterator I(Instr);
if (++I == Instr->getParent()->end())
return nullptr;
return wrap(&*I);
}
LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) {
Instruction *Instr = unwrap<Instruction>(Inst);
BasicBlock::iterator I(Instr);
if (I == Instr->getParent()->begin())
return nullptr;
return wrap(&*--I);
}
void LLVMInstructionRemoveFromParent(LLVMValueRef Inst) {
unwrap<Instruction>(Inst)->removeFromParent();
}
void LLVMInstructionEraseFromParent(LLVMValueRef Inst) {
unwrap<Instruction>(Inst)->eraseFromParent();
}
LLVMIntPredicate LLVMGetICmpPredicate(LLVMValueRef Inst) {
if (ICmpInst *I = dyn_cast<ICmpInst>(unwrap(Inst)))
return (LLVMIntPredicate)I->getPredicate();
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(unwrap(Inst)))
if (CE->getOpcode() == Instruction::ICmp)
return (LLVMIntPredicate)CE->getPredicate();
return (LLVMIntPredicate)0;
}
LLVMRealPredicate LLVMGetFCmpPredicate(LLVMValueRef Inst) {
if (FCmpInst *I = dyn_cast<FCmpInst>(unwrap(Inst)))
return (LLVMRealPredicate)I->getPredicate();
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(unwrap(Inst)))
if (CE->getOpcode() == Instruction::FCmp)
return (LLVMRealPredicate)CE->getPredicate();
return (LLVMRealPredicate)0;
}
LLVMOpcode LLVMGetInstructionOpcode(LLVMValueRef Inst) {
if (Instruction *C = dyn_cast<Instruction>(unwrap(Inst)))
return map_to_llvmopcode(C->getOpcode());
return (LLVMOpcode)0;
}
LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst) {
if (Instruction *C = dyn_cast<Instruction>(unwrap(Inst)))
return wrap(C->clone());
return nullptr;
}
LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst) {
Instruction *I = dyn_cast<Instruction>(unwrap(Inst));
return (I && I->isTerminator()) ? wrap(I) : nullptr;
}
unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) {
if (FuncletPadInst *FPI = dyn_cast<FuncletPadInst>(unwrap(Instr))) {
return FPI->getNumArgOperands();
}
return unwrap<CallBase>(Instr)->getNumArgOperands();
}
/*--.. Call and invoke instructions ........................................--*/
unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr) {
return unwrap<CallBase>(Instr)->getCallingConv();
}
void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
return unwrap<CallBase>(Instr)->setCallingConv(
static_cast<CallingConv::ID>(CC));
}
void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
unsigned align) {
auto *Call = unwrap<CallBase>(Instr);
Attribute AlignAttr = Attribute::getWithAlignment(Call->getContext(), align);
Call->addAttribute(index, AlignAttr);
}
void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
LLVMAttributeRef A) {
unwrap<CallBase>(C)->addAttribute(Idx, unwrap(A));
}
unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C,
LLVMAttributeIndex Idx) {
auto *Call = unwrap<CallBase>(C);
auto AS = Call->getAttributes().getAttributes(Idx);
return AS.getNumAttributes();
}
void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx,
LLVMAttributeRef *Attrs) {
auto *Call = unwrap<CallBase>(C);
auto AS = Call->getAttributes().getAttributes(Idx);
for (auto A : AS)
*Attrs++ = wrap(A);
}
LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C,
LLVMAttributeIndex Idx,
unsigned KindID) {
return wrap(
unwrap<CallBase>(C)->getAttribute(Idx, (Attribute::AttrKind)KindID));
}
LLVMAttributeRef LLVMGetCallSiteStringAttribute(LLVMValueRef C,
LLVMAttributeIndex Idx,
const char *K, unsigned KLen) {
return wrap(unwrap<CallBase>(C)->getAttribute(Idx, StringRef(K, KLen)));
}
void LLVMRemoveCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
unsigned KindID) {
unwrap<CallBase>(C)->removeAttribute(Idx, (Attribute::AttrKind)KindID);
}
void LLVMRemoveCallSiteStringAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
const char *K, unsigned KLen) {
unwrap<CallBase>(C)->removeAttribute(Idx, StringRef(K, KLen));
}
LLVMValueRef LLVMGetCalledValue(LLVMValueRef Instr) {
return wrap(unwrap<CallBase>(Instr)->getCalledValue());
}
LLVMTypeRef LLVMGetCalledFunctionType(LLVMValueRef Instr) {
return wrap(unwrap<CallBase>(Instr)->getFunctionType());
}
/*--.. Operations on call instructions (only) ..............................--*/
LLVMBool LLVMIsTailCall(LLVMValueRef Call) {
return unwrap<CallInst>(Call)->isTailCall();
}
void LLVMSetTailCall(LLVMValueRef Call, LLVMBool isTailCall) {
unwrap<CallInst>(Call)->setTailCall(isTailCall);
}
/*--.. Operations on invoke instructions (only) ............................--*/
LLVMBasicBlockRef LLVMGetNormalDest(LLVMValueRef Invoke) {
return wrap(unwrap<InvokeInst>(Invoke)->getNormalDest());
}
LLVMBasicBlockRef LLVMGetUnwindDest(LLVMValueRef Invoke) {
if (CleanupReturnInst *CRI = dyn_cast<CleanupReturnInst>(unwrap(Invoke))) {
return wrap(CRI->getUnwindDest());
} else if (CatchSwitchInst *CSI = dyn_cast<CatchSwitchInst>(unwrap(Invoke))) {
return wrap(CSI->getUnwindDest());
}
return wrap(unwrap<InvokeInst>(Invoke)->getUnwindDest());
}
void LLVMSetNormalDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
unwrap<InvokeInst>(Invoke)->setNormalDest(unwrap(B));
}
void LLVMSetUnwindDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
if (CleanupReturnInst *CRI = dyn_cast<CleanupReturnInst>(unwrap(Invoke))) {
return CRI->setUnwindDest(unwrap(B));
} else if (CatchSwitchInst *CSI = dyn_cast<CatchSwitchInst>(unwrap(Invoke))) {
return CSI->setUnwindDest(unwrap(B));
}
unwrap<InvokeInst>(Invoke)->setUnwindDest(unwrap(B));
}
/*--.. Operations on terminators ...........................................--*/
unsigned LLVMGetNumSuccessors(LLVMValueRef Term) {
return unwrap<Instruction>(Term)->getNumSuccessors();
}
LLVMBasicBlockRef LLVMGetSuccessor(LLVMValueRef Term, unsigned i) {
return wrap(unwrap<Instruction>(Term)->getSuccessor(i));
}
void LLVMSetSuccessor(LLVMValueRef Term, unsigned i, LLVMBasicBlockRef block) {
return unwrap<Instruction>(Term)->setSuccessor(i, unwrap(block));
}
/*--.. Operations on branch instructions (only) ............................--*/
LLVMBool LLVMIsConditional(LLVMValueRef Branch) {
return unwrap<BranchInst>(Branch)->isConditional();
}
LLVMValueRef LLVMGetCondition(LLVMValueRef Branch) {
return wrap(unwrap<BranchInst>(Branch)->getCondition());
}
void LLVMSetCondition(LLVMValueRef Branch, LLVMValueRef Cond) {
return unwrap<BranchInst>(Branch)->setCondition(unwrap(Cond));
}
/*--.. Operations on switch instructions (only) ............................--*/
LLVMBasicBlockRef LLVMGetSwitchDefaultDest(LLVMValueRef Switch) {
return wrap(unwrap<SwitchInst>(Switch)->getDefaultDest());
}
/*--.. Operations on alloca instructions (only) ............................--*/
LLVMTypeRef LLVMGetAllocatedType(LLVMValueRef Alloca) {
return wrap(unwrap<AllocaInst>(Alloca)->getAllocatedType());
}
/*--.. Operations on gep instructions (only) ...............................--*/
LLVMBool LLVMIsInBounds(LLVMValueRef GEP) {
return unwrap<GetElementPtrInst>(GEP)->isInBounds();
}
void LLVMSetIsInBounds(LLVMValueRef GEP, LLVMBool InBounds) {
return unwrap<GetElementPtrInst>(GEP)->setIsInBounds(InBounds);
}
/*--.. Operations on phi nodes .............................................--*/
void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues,
LLVMBasicBlockRef *IncomingBlocks, unsigned Count) {
PHINode *PhiVal = unwrap<PHINode>(PhiNode);
for (unsigned I = 0; I != Count; ++I)
PhiVal->addIncoming(unwrap(IncomingValues[I]), unwrap(IncomingBlocks[I]));
}
unsigned LLVMCountIncoming(LLVMValueRef PhiNode) {
return unwrap<PHINode>(PhiNode)->getNumIncomingValues();
}
LLVMValueRef LLVMGetIncomingValue(LLVMValueRef PhiNode, unsigned Index) {
return wrap(unwrap<PHINode>(PhiNode)->getIncomingValue(Index));
}
LLVMBasicBlockRef LLVMGetIncomingBlock(LLVMValueRef PhiNode, unsigned Index) {
return wrap(unwrap<PHINode>(PhiNode)->getIncomingBlock(Index));
}
/*--.. Operations on extractvalue and insertvalue nodes ....................--*/
unsigned LLVMGetNumIndices(LLVMValueRef Inst) {
auto *I = unwrap(Inst);
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
return GEP->getNumIndices();
if (auto *EV = dyn_cast<ExtractValueInst>(I))
return EV->getNumIndices();
if (auto *IV = dyn_cast<InsertValueInst>(I))
return IV->getNumIndices();
if (auto *CE = dyn_cast<ConstantExpr>(I))
return CE->getIndices().size();
llvm_unreachable(
"LLVMGetNumIndices applies only to extractvalue and insertvalue!");
}
const unsigned *LLVMGetIndices(LLVMValueRef Inst) {
auto *I = unwrap(Inst);
if (auto *EV = dyn_cast<ExtractValueInst>(I))
return EV->getIndices().data();
if (auto *IV = dyn_cast<InsertValueInst>(I))
return IV->getIndices().data();
if (auto *CE = dyn_cast<ConstantExpr>(I))
return CE->getIndices().data();
llvm_unreachable(
"LLVMGetIndices applies only to extractvalue and insertvalue!");
}
/*===-- Instruction builders ----------------------------------------------===*/
LLVMBuilderRef LLVMCreateBuilderInContext(LLVMContextRef C) {
return wrap(new IRBuilder<>(*unwrap(C)));
}
LLVMBuilderRef LLVMCreateBuilder(void) {
return LLVMCreateBuilderInContext(LLVMGetGlobalContext());
}
void LLVMPositionBuilder(LLVMBuilderRef Builder, LLVMBasicBlockRef Block,
LLVMValueRef Instr) {
BasicBlock *BB = unwrap(Block);
auto I = Instr ? unwrap<Instruction>(Instr)->getIterator() : BB->end();
unwrap(Builder)->SetInsertPoint(BB, I);
}
void LLVMPositionBuilderBefore(LLVMBuilderRef Builder, LLVMValueRef Instr) {
Instruction *I = unwrap<Instruction>(Instr);
unwrap(Builder)->SetInsertPoint(I->getParent(), I->getIterator());
}
void LLVMPositionBuilderAtEnd(LLVMBuilderRef Builder, LLVMBasicBlockRef Block) {
BasicBlock *BB = unwrap(Block);
unwrap(Builder)->SetInsertPoint(BB);
}
LLVMBasicBlockRef LLVMGetInsertBlock(LLVMBuilderRef Builder) {
return wrap(unwrap(Builder)->GetInsertBlock());
}
void LLVMClearInsertionPosition(LLVMBuilderRef Builder) {
unwrap(Builder)->ClearInsertionPoint();
}
void LLVMInsertIntoBuilder(LLVMBuilderRef Builder, LLVMValueRef Instr) {
unwrap(Builder)->Insert(unwrap<Instruction>(Instr));
}
void LLVMInsertIntoBuilderWithName(LLVMBuilderRef Builder, LLVMValueRef Instr,
const char *Name) {
unwrap(Builder)->Insert(unwrap<Instruction>(Instr), Name);
}
void LLVMDisposeBuilder(LLVMBuilderRef Builder) {
delete unwrap(Builder);
}
/*--.. Metadata builders ...................................................--*/
LLVMMetadataRef LLVMGetCurrentDebugLocation2(LLVMBuilderRef Builder) {
return wrap(unwrap(Builder)->getCurrentDebugLocation().getAsMDNode());
}
void LLVMSetCurrentDebugLocation2(LLVMBuilderRef Builder, LLVMMetadataRef Loc) {
if (Loc)
unwrap(Builder)->SetCurrentDebugLocation(DebugLoc(unwrap<MDNode>(Loc)));
else
unwrap(Builder)->SetCurrentDebugLocation(DebugLoc());
}
void LLVMSetCurrentDebugLocation(LLVMBuilderRef Builder, LLVMValueRef L) {
MDNode *Loc =
L ? cast<MDNode>(unwrap<MetadataAsValue>(L)->getMetadata()) : nullptr;
unwrap(Builder)->SetCurrentDebugLocation(DebugLoc(Loc));
}
LLVMValueRef LLVMGetCurrentDebugLocation(LLVMBuilderRef Builder) {
LLVMContext &Context = unwrap(Builder)->getContext();
return wrap(MetadataAsValue::get(
Context, unwrap(Builder)->getCurrentDebugLocation().getAsMDNode()));
}
void LLVMSetInstDebugLocation(LLVMBuilderRef Builder, LLVMValueRef Inst) {
unwrap(Builder)->SetInstDebugLocation(unwrap<Instruction>(Inst));
}
void LLVMBuilderSetDefaultFPMathTag(LLVMBuilderRef Builder,
LLVMMetadataRef FPMathTag) {
unwrap(Builder)->setDefaultFPMathTag(FPMathTag
? unwrap<MDNode>(FPMathTag)
: nullptr);
}
LLVMMetadataRef LLVMBuilderGetDefaultFPMathTag(LLVMBuilderRef Builder) {
return wrap(unwrap(Builder)->getDefaultFPMathTag());
}
/*--.. Instruction builders ................................................--*/
LLVMValueRef LLVMBuildRetVoid(LLVMBuilderRef B) {
return wrap(unwrap(B)->CreateRetVoid());
}
LLVMValueRef LLVMBuildRet(LLVMBuilderRef B, LLVMValueRef V) {
return wrap(unwrap(B)->CreateRet(unwrap(V)));
}
LLVMValueRef LLVMBuildAggregateRet(LLVMBuilderRef B, LLVMValueRef *RetVals,
unsigned N) {
return wrap(unwrap(B)->CreateAggregateRet(unwrap(RetVals), N));
}
LLVMValueRef LLVMBuildBr(LLVMBuilderRef B, LLVMBasicBlockRef Dest) {
return wrap(unwrap(B)->CreateBr(unwrap(Dest)));
}
LLVMValueRef LLVMBuildCondBr(LLVMBuilderRef B, LLVMValueRef If,
LLVMBasicBlockRef Then, LLVMBasicBlockRef Else) {
return wrap(unwrap(B)->CreateCondBr(unwrap(If), unwrap(Then), unwrap(Else)));
}
LLVMValueRef LLVMBuildSwitch(LLVMBuilderRef B, LLVMValueRef V,
LLVMBasicBlockRef Else, unsigned NumCases) {
return wrap(unwrap(B)->CreateSwitch(unwrap(V), unwrap(Else), NumCases));
}
LLVMValueRef LLVMBuildIndirectBr(LLVMBuilderRef B, LLVMValueRef Addr,
unsigned NumDests) {
return wrap(unwrap(B)->CreateIndirectBr(unwrap(Addr), NumDests));
}
LLVMValueRef LLVMBuildInvoke(LLVMBuilderRef B, LLVMValueRef Fn,
LLVMValueRef *Args, unsigned NumArgs,
LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch,
const char *Name) {
Value *V = unwrap(Fn);
FunctionType *FnT =
cast<FunctionType>(cast<PointerType>(V->getType())->getElementType());
return wrap(
unwrap(B)->CreateInvoke(FnT, unwrap(Fn), unwrap(Then), unwrap(Catch),
makeArrayRef(unwrap(Args), NumArgs), Name));
}
LLVMValueRef LLVMBuildInvoke2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn,
LLVMValueRef *Args, unsigned NumArgs,
LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch,
const char *Name) {
return wrap(unwrap(B)->CreateInvoke(
unwrap<FunctionType>(Ty), unwrap(Fn), unwrap(Then), unwrap(Catch),
makeArrayRef(unwrap(Args), NumArgs), Name));
}
LLVMValueRef LLVMBuildLandingPad(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef PersFn, unsigned NumClauses,
const char *Name) {
// The personality used to live on the landingpad instruction, but now it
// lives on the parent function. For compatibility, take the provided
// personality and put it on the parent function.
if (PersFn)
unwrap(B)->GetInsertBlock()->getParent()->setPersonalityFn(
cast<Function>(unwrap(PersFn)));
return wrap(unwrap(B)->CreateLandingPad(unwrap(Ty), NumClauses, Name));
}
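// Illustrative caller-side sketch (not part of the original implementation),
// assuming the caller's `B`, `Fn`, `LPTy`, and `PersFn`: newer clients set the
// personality on the function themselves and pass a null PersFn here, while
// older clients that still pass PersFn have it forwarded to the enclosing
// function as shown above.
//
//   LLVMSetPersonalityFn(Fn, PersFn);
//   LLVMValueRef LP = LLVMBuildLandingPad(B, LPTy, /*PersFn=*/NULL, 1, "lp");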
LLVMValueRef LLVMBuildCatchPad(LLVMBuilderRef B, LLVMValueRef ParentPad,
LLVMValueRef *Args, unsigned NumArgs,
const char *Name) {
return wrap(unwrap(B)->CreateCatchPad(unwrap(ParentPad),
makeArrayRef(unwrap(Args), NumArgs),
Name));
}
LLVMValueRef LLVMBuildCleanupPad(LLVMBuilderRef B, LLVMValueRef ParentPad,
LLVMValueRef *Args, unsigned NumArgs,
const char *Name) {
if (ParentPad == nullptr) {
Type *Ty = Type::getTokenTy(unwrap(B)->getContext());
ParentPad = wrap(Constant::getNullValue(Ty));
}
return wrap(unwrap(B)->CreateCleanupPad(unwrap(ParentPad),
makeArrayRef(unwrap(Args), NumArgs),
Name));
}
LLVMValueRef LLVMBuildResume(LLVMBuilderRef B, LLVMValueRef Exn) {
return wrap(unwrap(B)->CreateResume(unwrap(Exn)));
}
LLVMValueRef LLVMBuildCatchSwitch(LLVMBuilderRef B, LLVMValueRef ParentPad,
LLVMBasicBlockRef UnwindBB,
unsigned NumHandlers, const char *Name) {
if (ParentPad == nullptr) {
Type *Ty = Type::getTokenTy(unwrap(B)->getContext());
ParentPad = wrap(Constant::getNullValue(Ty));
}
return wrap(unwrap(B)->CreateCatchSwitch(unwrap(ParentPad), unwrap(UnwindBB),
NumHandlers, Name));
}
LLVMValueRef LLVMBuildCatchRet(LLVMBuilderRef B, LLVMValueRef CatchPad,
LLVMBasicBlockRef BB) {
return wrap(unwrap(B)->CreateCatchRet(unwrap<CatchPadInst>(CatchPad),
unwrap(BB)));
}
LLVMValueRef LLVMBuildCleanupRet(LLVMBuilderRef B, LLVMValueRef CatchPad,
LLVMBasicBlockRef BB) {
return wrap(unwrap(B)->CreateCleanupRet(unwrap<CleanupPadInst>(CatchPad),
unwrap(BB)));
}
LLVMValueRef LLVMBuildUnreachable(LLVMBuilderRef B) {
return wrap(unwrap(B)->CreateUnreachable());
}
void LLVMAddCase(LLVMValueRef Switch, LLVMValueRef OnVal,
LLVMBasicBlockRef Dest) {
unwrap<SwitchInst>(Switch)->addCase(unwrap<ConstantInt>(OnVal), unwrap(Dest));
}
void LLVMAddDestination(LLVMValueRef IndirectBr, LLVMBasicBlockRef Dest) {
unwrap<IndirectBrInst>(IndirectBr)->addDestination(unwrap(Dest));
}
unsigned LLVMGetNumClauses(LLVMValueRef LandingPad) {
return unwrap<LandingPadInst>(LandingPad)->getNumClauses();
}
LLVMValueRef LLVMGetClause(LLVMValueRef LandingPad, unsigned Idx) {
return wrap(unwrap<LandingPadInst>(LandingPad)->getClause(Idx));
}
void LLVMAddClause(LLVMValueRef LandingPad, LLVMValueRef ClauseVal) {
unwrap<LandingPadInst>(LandingPad)->
addClause(cast<Constant>(unwrap(ClauseVal)));
}
LLVMBool LLVMIsCleanup(LLVMValueRef LandingPad) {
return unwrap<LandingPadInst>(LandingPad)->isCleanup();
}
void LLVMSetCleanup(LLVMValueRef LandingPad, LLVMBool Val) {
unwrap<LandingPadInst>(LandingPad)->setCleanup(Val);
}
void LLVMAddHandler(LLVMValueRef CatchSwitch, LLVMBasicBlockRef Dest) {
unwrap<CatchSwitchInst>(CatchSwitch)->addHandler(unwrap(Dest));
}
unsigned LLVMGetNumHandlers(LLVMValueRef CatchSwitch) {
return unwrap<CatchSwitchInst>(CatchSwitch)->getNumHandlers();
}
void LLVMGetHandlers(LLVMValueRef CatchSwitch, LLVMBasicBlockRef *Handlers) {
CatchSwitchInst *CSI = unwrap<CatchSwitchInst>(CatchSwitch);
for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(),
E = CSI->handler_end(); I != E; ++I)
*Handlers++ = wrap(*I);
}
LLVMValueRef LLVMGetParentCatchSwitch(LLVMValueRef CatchPad) {
return wrap(unwrap<CatchPadInst>(CatchPad)->getCatchSwitch());
}
void LLVMSetParentCatchSwitch(LLVMValueRef CatchPad, LLVMValueRef CatchSwitch) {
unwrap<CatchPadInst>(CatchPad)
->setCatchSwitch(unwrap<CatchSwitchInst>(CatchSwitch));
}
/*--.. Funclets ...........................................................--*/
LLVMValueRef LLVMGetArgOperand(LLVMValueRef Funclet, unsigned i) {
return wrap(unwrap<FuncletPadInst>(Funclet)->getArgOperand(i));
}
void LLVMSetArgOperand(LLVMValueRef Funclet, unsigned i, LLVMValueRef value) {
unwrap<FuncletPadInst>(Funclet)->setArgOperand(i, unwrap(value));
}
/*--.. Arithmetic ..........................................................--*/
LLVMValueRef LLVMBuildAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateAdd(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildNSWAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateNSWAdd(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildNUWAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateNUWAdd(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildFAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateFAdd(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateSub(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildNSWSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateNSWSub(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildNUWSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateNUWSub(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildFSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateFSub(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateMul(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildNSWMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateNSWMul(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildNUWMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateNUWMul(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildFMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateFMul(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildUDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateUDiv(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildExactUDiv(LLVMBuilderRef B, LLVMValueRef LHS,
LLVMValueRef RHS, const char *Name) {
return wrap(unwrap(B)->CreateExactUDiv(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildSDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateSDiv(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildExactSDiv(LLVMBuilderRef B, LLVMValueRef LHS,
LLVMValueRef RHS, const char *Name) {
return wrap(unwrap(B)->CreateExactSDiv(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildFDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateFDiv(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildURem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateURem(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildSRem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateSRem(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildFRem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateFRem(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildShl(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateShl(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildLShr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateLShr(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildAShr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateAShr(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildAnd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateAnd(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildOr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateOr(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildXor(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateXor(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildBinOp(LLVMBuilderRef B, LLVMOpcode Op,
LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateBinOp(Instruction::BinaryOps(map_from_llvmopcode(Op)), unwrap(LHS),
unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
return wrap(unwrap(B)->CreateNeg(unwrap(V), Name));
}
LLVMValueRef LLVMBuildNSWNeg(LLVMBuilderRef B, LLVMValueRef V,
const char *Name) {
return wrap(unwrap(B)->CreateNSWNeg(unwrap(V), Name));
}
LLVMValueRef LLVMBuildNUWNeg(LLVMBuilderRef B, LLVMValueRef V,
const char *Name) {
return wrap(unwrap(B)->CreateNUWNeg(unwrap(V), Name));
}
LLVMValueRef LLVMBuildFNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
return wrap(unwrap(B)->CreateFNeg(unwrap(V), Name));
}
LLVMValueRef LLVMBuildNot(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
return wrap(unwrap(B)->CreateNot(unwrap(V), Name));
}
/*--.. Memory ..............................................................--*/
LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
const char *Name) {
Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
ITy, unwrap(Ty), AllocSize,
nullptr, nullptr, "");
return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
}
LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef Val, const char *Name) {
Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
ITy, unwrap(Ty), AllocSize,
unwrap(Val), nullptr, "");
return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
}
LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr,
LLVMValueRef Val, LLVMValueRef Len,
unsigned Align) {
return wrap(unwrap(B)->CreateMemSet(unwrap(Ptr), unwrap(Val), unwrap(Len), Align));
}
LLVMValueRef LLVMBuildMemCpy(LLVMBuilderRef B,
LLVMValueRef Dst, unsigned DstAlign,
LLVMValueRef Src, unsigned SrcAlign,
LLVMValueRef Size) {
return wrap(unwrap(B)->CreateMemCpy(unwrap(Dst), DstAlign,
unwrap(Src), SrcAlign,
unwrap(Size)));
}
LLVMValueRef LLVMBuildMemMove(LLVMBuilderRef B,
LLVMValueRef Dst, unsigned DstAlign,
LLVMValueRef Src, unsigned SrcAlign,
LLVMValueRef Size) {
return wrap(unwrap(B)->CreateMemMove(unwrap(Dst), DstAlign,
unwrap(Src), SrcAlign,
unwrap(Size)));
}
LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
const char *Name) {
return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), nullptr, Name));
}
LLVMValueRef LLVMBuildArrayAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef Val, const char *Name) {
return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), unwrap(Val), Name));
}
LLVMValueRef LLVMBuildFree(LLVMBuilderRef B, LLVMValueRef PointerVal) {
return wrap(unwrap(B)->Insert(
CallInst::CreateFree(unwrap(PointerVal), unwrap(B)->GetInsertBlock())));
}
LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal,
const char *Name) {
Value *V = unwrap(PointerVal);
PointerType *Ty = cast<PointerType>(V->getType());
return wrap(unwrap(B)->CreateLoad(Ty->getElementType(), V, Name));
}
LLVMValueRef LLVMBuildLoad2(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef PointerVal, const char *Name) {
return wrap(unwrap(B)->CreateLoad(unwrap(Ty), unwrap(PointerVal), Name));
}
LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val,
LLVMValueRef PointerVal) {
return wrap(unwrap(B)->CreateStore(unwrap(Val), unwrap(PointerVal)));
}
static AtomicOrdering mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) {
switch (Ordering) {
case LLVMAtomicOrderingNotAtomic: return AtomicOrdering::NotAtomic;
case LLVMAtomicOrderingUnordered: return AtomicOrdering::Unordered;
case LLVMAtomicOrderingMonotonic: return AtomicOrdering::Monotonic;
case LLVMAtomicOrderingAcquire: return AtomicOrdering::Acquire;
case LLVMAtomicOrderingRelease: return AtomicOrdering::Release;
case LLVMAtomicOrderingAcquireRelease:
return AtomicOrdering::AcquireRelease;
case LLVMAtomicOrderingSequentiallyConsistent:
return AtomicOrdering::SequentiallyConsistent;
}
llvm_unreachable("Invalid LLVMAtomicOrdering value!");
}
static LLVMAtomicOrdering mapToLLVMOrdering(AtomicOrdering Ordering) {
switch (Ordering) {
case AtomicOrdering::NotAtomic: return LLVMAtomicOrderingNotAtomic;
case AtomicOrdering::Unordered: return LLVMAtomicOrderingUnordered;
case AtomicOrdering::Monotonic: return LLVMAtomicOrderingMonotonic;
case AtomicOrdering::Acquire: return LLVMAtomicOrderingAcquire;
case AtomicOrdering::Release: return LLVMAtomicOrderingRelease;
case AtomicOrdering::AcquireRelease:
return LLVMAtomicOrderingAcquireRelease;
case AtomicOrdering::SequentiallyConsistent:
return LLVMAtomicOrderingSequentiallyConsistent;
}
llvm_unreachable("Invalid AtomicOrdering value!");
}
// TODO: Should this and other atomic instructions support building with
// "syncscope"?
LLVMValueRef LLVMBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering,
LLVMBool isSingleThread, const char *Name) {
return wrap(
unwrap(B)->CreateFence(mapFromLLVMOrdering(Ordering),
isSingleThread ? SyncScope::SingleThread
: SyncScope::System,
Name));
}
LLVMValueRef LLVMBuildGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
LLVMValueRef *Indices, unsigned NumIndices,
const char *Name) {
ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
Value *Val = unwrap(Pointer);
Type *Ty =
cast<PointerType>(Val->getType()->getScalarType())->getElementType();
return wrap(unwrap(B)->CreateGEP(Ty, Val, IdxList, Name));
}
LLVMValueRef LLVMBuildGEP2(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef Pointer, LLVMValueRef *Indices,
unsigned NumIndices, const char *Name) {
ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
return wrap(unwrap(B)->CreateGEP(unwrap(Ty), unwrap(Pointer), IdxList, Name));
}
LLVMValueRef LLVMBuildInBoundsGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
LLVMValueRef *Indices, unsigned NumIndices,
const char *Name) {
ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
Value *Val = unwrap(Pointer);
Type *Ty =
cast<PointerType>(Val->getType()->getScalarType())->getElementType();
return wrap(unwrap(B)->CreateInBoundsGEP(Ty, Val, IdxList, Name));
}
LLVMValueRef LLVMBuildInBoundsGEP2(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef Pointer, LLVMValueRef *Indices,
unsigned NumIndices, const char *Name) {
ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
return wrap(
unwrap(B)->CreateInBoundsGEP(unwrap(Ty), unwrap(Pointer), IdxList, Name));
}
LLVMValueRef LLVMBuildStructGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
unsigned Idx, const char *Name) {
Value *Val = unwrap(Pointer);
Type *Ty =
cast<PointerType>(Val->getType()->getScalarType())->getElementType();
return wrap(unwrap(B)->CreateStructGEP(Ty, Val, Idx, Name));
}
LLVMValueRef LLVMBuildStructGEP2(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef Pointer, unsigned Idx,
const char *Name) {
return wrap(
unwrap(B)->CreateStructGEP(unwrap(Ty), unwrap(Pointer), Idx, Name));
}
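// --- Hedged usage sketch (editor's addition, not part of this revision) ---
// The explicitly-typed GEP wrappers above take the pointee type from the
// caller instead of recovering it from the pointer type. A minimal sketch of
// driving them from client code, assuming `B`, `Ctx` and `ArrayPtr` (a
// pointer to [16 x i32]) come from the surrounding codegen context:
static LLVMValueRef emitArrayElementAddr(LLVMBuilderRef B, LLVMContextRef Ctx,
                                         LLVMValueRef ArrayPtr, unsigned Idx) {
  LLVMTypeRef I32 = LLVMInt32TypeInContext(Ctx);
  LLVMTypeRef ArrTy = LLVMArrayType(I32, 16);
  LLVMValueRef Indices[2] = {
      LLVMConstInt(I32, 0, /*SignExtend=*/0),    // step through the pointer
      LLVMConstInt(I32, Idx, /*SignExtend=*/0)}; // select the element
  return LLVMBuildInBoundsGEP2(B, ArrTy, ArrayPtr, Indices, 2, "elt.addr");
}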
LLVMValueRef LLVMBuildGlobalString(LLVMBuilderRef B, const char *Str,
const char *Name) {
return wrap(unwrap(B)->CreateGlobalString(Str, Name));
}
LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str,
const char *Name) {
return wrap(unwrap(B)->CreateGlobalStringPtr(Str, Name));
}
LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) {
Value *P = unwrap<Value>(MemAccessInst);
if (LoadInst *LI = dyn_cast<LoadInst>(P))
return LI->isVolatile();
return cast<StoreInst>(P)->isVolatile();
}
void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) {
Value *P = unwrap<Value>(MemAccessInst);
if (LoadInst *LI = dyn_cast<LoadInst>(P))
return LI->setVolatile(isVolatile);
return cast<StoreInst>(P)->setVolatile(isVolatile);
}
LLVMAtomicOrdering LLVMGetOrdering(LLVMValueRef MemAccessInst) {
Value *P = unwrap<Value>(MemAccessInst);
AtomicOrdering O;
if (LoadInst *LI = dyn_cast<LoadInst>(P))
O = LI->getOrdering();
else
O = cast<StoreInst>(P)->getOrdering();
return mapToLLVMOrdering(O);
}
void LLVMSetOrdering(LLVMValueRef MemAccessInst, LLVMAtomicOrdering Ordering) {
Value *P = unwrap<Value>(MemAccessInst);
AtomicOrdering O = mapFromLLVMOrdering(Ordering);
if (LoadInst *LI = dyn_cast<LoadInst>(P))
return LI->setOrdering(O);
return cast<StoreInst>(P)->setOrdering(O);
}
/*--.. Casts ...............................................................--*/
LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateTrunc(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildZExt(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateZExt(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildSExt(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateSExt(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildFPToUI(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateFPToUI(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildFPToSI(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateFPToSI(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildUIToFP(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateUIToFP(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildSIToFP(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateSIToFP(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildFPTrunc(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateFPTrunc(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildFPExt(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateFPExt(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildPtrToInt(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreatePtrToInt(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildIntToPtr(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateIntToPtr(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildBitCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateBitCast(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildAddrSpaceCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateAddrSpaceCast(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildZExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateZExtOrBitCast(unwrap(Val), unwrap(DestTy),
Name));
}
LLVMValueRef LLVMBuildSExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateSExtOrBitCast(unwrap(Val), unwrap(DestTy),
Name));
}
LLVMValueRef LLVMBuildTruncOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateTruncOrBitCast(unwrap(Val), unwrap(DestTy),
Name));
}
LLVMValueRef LLVMBuildCast(LLVMBuilderRef B, LLVMOpcode Op, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateCast(Instruction::CastOps(map_from_llvmopcode(Op)), unwrap(Val),
unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildPointerCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreatePointerCast(unwrap(Val), unwrap(DestTy), Name));
}
LLVMValueRef LLVMBuildIntCast2(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, LLVMBool IsSigned,
const char *Name) {
return wrap(
unwrap(B)->CreateIntCast(unwrap(Val), unwrap(DestTy), IsSigned, Name));
}
LLVMValueRef LLVMBuildIntCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateIntCast(unwrap(Val), unwrap(DestTy),
/*isSigned*/true, Name));
}
LLVMValueRef LLVMBuildFPCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateFPCast(unwrap(Val), unwrap(DestTy), Name));
}
/*--.. Comparisons .........................................................--*/
LLVMValueRef LLVMBuildICmp(LLVMBuilderRef B, LLVMIntPredicate Op,
LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateICmp(static_cast<ICmpInst::Predicate>(Op),
unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildFCmp(LLVMBuilderRef B, LLVMRealPredicate Op,
LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
return wrap(unwrap(B)->CreateFCmp(static_cast<FCmpInst::Predicate>(Op),
unwrap(LHS), unwrap(RHS), Name));
}
/*--.. Miscellaneous instructions ..........................................--*/
LLVMValueRef LLVMBuildPhi(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name) {
return wrap(unwrap(B)->CreatePHI(unwrap(Ty), 0, Name));
}
LLVMValueRef LLVMBuildCall(LLVMBuilderRef B, LLVMValueRef Fn,
LLVMValueRef *Args, unsigned NumArgs,
const char *Name) {
Value *V = unwrap(Fn);
FunctionType *FnT =
cast<FunctionType>(cast<PointerType>(V->getType())->getElementType());
return wrap(unwrap(B)->CreateCall(FnT, unwrap(Fn),
makeArrayRef(unwrap(Args), NumArgs), Name));
}
LLVMValueRef LLVMBuildCall2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn,
LLVMValueRef *Args, unsigned NumArgs,
const char *Name) {
FunctionType *FTy = unwrap<FunctionType>(Ty);
return wrap(unwrap(B)->CreateCall(FTy, unwrap(Fn),
makeArrayRef(unwrap(Args), NumArgs), Name));
}
LLVMValueRef LLVMBuildSelect(LLVMBuilderRef B, LLVMValueRef If,
LLVMValueRef Then, LLVMValueRef Else,
const char *Name) {
return wrap(unwrap(B)->CreateSelect(unwrap(If), unwrap(Then), unwrap(Else),
Name));
}
LLVMValueRef LLVMBuildVAArg(LLVMBuilderRef B, LLVMValueRef List,
LLVMTypeRef Ty, const char *Name) {
return wrap(unwrap(B)->CreateVAArg(unwrap(List), unwrap(Ty), Name));
}
LLVMValueRef LLVMBuildExtractElement(LLVMBuilderRef B, LLVMValueRef VecVal,
LLVMValueRef Index, const char *Name) {
return wrap(unwrap(B)->CreateExtractElement(unwrap(VecVal), unwrap(Index),
Name));
}
LLVMValueRef LLVMBuildInsertElement(LLVMBuilderRef B, LLVMValueRef VecVal,
LLVMValueRef EltVal, LLVMValueRef Index,
const char *Name) {
return wrap(unwrap(B)->CreateInsertElement(unwrap(VecVal), unwrap(EltVal),
unwrap(Index), Name));
}
LLVMValueRef LLVMBuildShuffleVector(LLVMBuilderRef B, LLVMValueRef V1,
LLVMValueRef V2, LLVMValueRef Mask,
const char *Name) {
return wrap(unwrap(B)->CreateShuffleVector(unwrap(V1), unwrap(V2),
unwrap(Mask), Name));
}
LLVMValueRef LLVMBuildExtractValue(LLVMBuilderRef B, LLVMValueRef AggVal,
unsigned Index, const char *Name) {
return wrap(unwrap(B)->CreateExtractValue(unwrap(AggVal), Index, Name));
}
LLVMValueRef LLVMBuildInsertValue(LLVMBuilderRef B, LLVMValueRef AggVal,
LLVMValueRef EltVal, unsigned Index,
const char *Name) {
return wrap(unwrap(B)->CreateInsertValue(unwrap(AggVal), unwrap(EltVal),
Index, Name));
}
LLVMValueRef LLVMBuildIsNull(LLVMBuilderRef B, LLVMValueRef Val,
const char *Name) {
return wrap(unwrap(B)->CreateIsNull(unwrap(Val), Name));
}
LLVMValueRef LLVMBuildIsNotNull(LLVMBuilderRef B, LLVMValueRef Val,
const char *Name) {
return wrap(unwrap(B)->CreateIsNotNull(unwrap(Val), Name));
}
LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef B, LLVMValueRef LHS,
LLVMValueRef RHS, const char *Name) {
return wrap(unwrap(B)->CreatePtrDiff(unwrap(LHS), unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B, LLVMAtomicRMWBinOp op,
LLVMValueRef PTR, LLVMValueRef Val,
LLVMAtomicOrdering ordering,
LLVMBool singleThread) {
AtomicRMWInst::BinOp intop;
switch (op) {
case LLVMAtomicRMWBinOpXchg: intop = AtomicRMWInst::Xchg; break;
case LLVMAtomicRMWBinOpAdd: intop = AtomicRMWInst::Add; break;
case LLVMAtomicRMWBinOpSub: intop = AtomicRMWInst::Sub; break;
case LLVMAtomicRMWBinOpAnd: intop = AtomicRMWInst::And; break;
case LLVMAtomicRMWBinOpNand: intop = AtomicRMWInst::Nand; break;
case LLVMAtomicRMWBinOpOr: intop = AtomicRMWInst::Or; break;
case LLVMAtomicRMWBinOpXor: intop = AtomicRMWInst::Xor; break;
case LLVMAtomicRMWBinOpMax: intop = AtomicRMWInst::Max; break;
case LLVMAtomicRMWBinOpMin: intop = AtomicRMWInst::Min; break;
case LLVMAtomicRMWBinOpUMax: intop = AtomicRMWInst::UMax; break;
case LLVMAtomicRMWBinOpUMin: intop = AtomicRMWInst::UMin; break;
}
return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val),
mapFromLLVMOrdering(ordering), singleThread ? SyncScope::SingleThread
: SyncScope::System));
}
LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr,
LLVMValueRef Cmp, LLVMValueRef New,
LLVMAtomicOrdering SuccessOrdering,
LLVMAtomicOrdering FailureOrdering,
LLVMBool singleThread) {
return wrap(unwrap(B)->CreateAtomicCmpXchg(unwrap(Ptr), unwrap(Cmp),
unwrap(New), mapFromLLVMOrdering(SuccessOrdering),
mapFromLLVMOrdering(FailureOrdering),
singleThread ? SyncScope::SingleThread : SyncScope::System));
}
LLVMBool LLVMIsAtomicSingleThread(LLVMValueRef AtomicInst) {
Value *P = unwrap<Value>(AtomicInst);
if (AtomicRMWInst *I = dyn_cast<AtomicRMWInst>(P))
return I->getSyncScopeID() == SyncScope::SingleThread;
return cast<AtomicCmpXchgInst>(P)->getSyncScopeID() ==
SyncScope::SingleThread;
}
void LLVMSetAtomicSingleThread(LLVMValueRef AtomicInst, LLVMBool NewValue) {
Value *P = unwrap<Value>(AtomicInst);
SyncScope::ID SSID = NewValue ? SyncScope::SingleThread : SyncScope::System;
if (AtomicRMWInst *I = dyn_cast<AtomicRMWInst>(P))
return I->setSyncScopeID(SSID);
return cast<AtomicCmpXchgInst>(P)->setSyncScopeID(SSID);
}
LLVMAtomicOrdering LLVMGetCmpXchgSuccessOrdering(LLVMValueRef CmpXchgInst) {
Value *P = unwrap<Value>(CmpXchgInst);
return mapToLLVMOrdering(cast<AtomicCmpXchgInst>(P)->getSuccessOrdering());
}
void LLVMSetCmpXchgSuccessOrdering(LLVMValueRef CmpXchgInst,
LLVMAtomicOrdering Ordering) {
Value *P = unwrap<Value>(CmpXchgInst);
AtomicOrdering O = mapFromLLVMOrdering(Ordering);
return cast<AtomicCmpXchgInst>(P)->setSuccessOrdering(O);
}
LLVMAtomicOrdering LLVMGetCmpXchgFailureOrdering(LLVMValueRef CmpXchgInst) {
Value *P = unwrap<Value>(CmpXchgInst);
return mapToLLVMOrdering(cast<AtomicCmpXchgInst>(P)->getFailureOrdering());
}
void LLVMSetCmpXchgFailureOrdering(LLVMValueRef CmpXchgInst,
LLVMAtomicOrdering Ordering) {
Value *P = unwrap<Value>(CmpXchgInst);
AtomicOrdering O = mapFromLLVMOrdering(Ordering);
return cast<AtomicCmpXchgInst>(P)->setFailureOrdering(O);
}
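// --- Hedged usage sketch (editor's addition, not part of this revision) ---
// How a client might exercise the atomic wrappers above; `B`, `Ptr` and `Val`
// are assumed to come from the surrounding codegen context.
static LLVMValueRef emitMonotonicAtomicAdd(LLVMBuilderRef B, LLVMValueRef Ptr,
                                           LLVMValueRef Val) {
  // LLVMAtomicOrderingMonotonic is translated by mapFromLLVMOrdering() into
  // AtomicOrdering::Monotonic on the underlying AtomicRMWInst.
  return LLVMBuildAtomicRMW(B, LLVMAtomicRMWBinOpAdd, Ptr, Val,
                            LLVMAtomicOrderingMonotonic,
                            /*singleThread=*/0);
}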
/*===-- Module providers --------------------------------------------------===*/
LLVMModuleProviderRef
LLVMCreateModuleProviderForExistingModule(LLVMModuleRef M) {
return reinterpret_cast<LLVMModuleProviderRef>(M);
}
void LLVMDisposeModuleProvider(LLVMModuleProviderRef MP) {
delete unwrap(MP);
}
/*===-- Memory buffers ----------------------------------------------------===*/
LLVMBool LLVMCreateMemoryBufferWithContentsOfFile(
const char *Path,
LLVMMemoryBufferRef *OutMemBuf,
char **OutMessage) {
ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = MemoryBuffer::getFile(Path);
if (std::error_code EC = MBOrErr.getError()) {
*OutMessage = strdup(EC.message().c_str());
return 1;
}
*OutMemBuf = wrap(MBOrErr.get().release());
return 0;
}
LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf,
char **OutMessage) {
ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = MemoryBuffer::getSTDIN();
if (std::error_code EC = MBOrErr.getError()) {
*OutMessage = strdup(EC.message().c_str());
return 1;
}
*OutMemBuf = wrap(MBOrErr.get().release());
return 0;
}
LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRange(
const char *InputData,
size_t InputDataLength,
const char *BufferName,
LLVMBool RequiresNullTerminator) {
return wrap(MemoryBuffer::getMemBuffer(StringRef(InputData, InputDataLength),
StringRef(BufferName),
RequiresNullTerminator).release());
}
LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRangeCopy(
const char *InputData,
size_t InputDataLength,
const char *BufferName) {
return wrap(
MemoryBuffer::getMemBufferCopy(StringRef(InputData, InputDataLength),
StringRef(BufferName)).release());
}
const char *LLVMGetBufferStart(LLVMMemoryBufferRef MemBuf) {
return unwrap(MemBuf)->getBufferStart();
}
size_t LLVMGetBufferSize(LLVMMemoryBufferRef MemBuf) {
return unwrap(MemBuf)->getBufferSize();
}
void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf) {
delete unwrap(MemBuf);
}
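// --- Hedged usage sketch (editor's addition, not part of this revision) ---
// Round-tripping an in-memory string through the buffer helpers above. The
// literal and buffer name are illustrative only.
static bool memoryBufferRoundTrip() {
  const char Data[] = "hello";
  LLVMMemoryBufferRef MB = LLVMCreateMemoryBufferWithMemoryRange(
      Data, sizeof(Data) - 1, "example-buffer", /*RequiresNullTerminator=*/0);
  // The accessors report exactly the range that was handed in.
  bool OK = LLVMGetBufferSize(MB) == sizeof(Data) - 1 &&
            LLVMGetBufferStart(MB)[0] == 'h';
  LLVMDisposeMemoryBuffer(MB);
  return OK;
}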
/*===-- Pass Registry -----------------------------------------------------===*/
LLVMPassRegistryRef LLVMGetGlobalPassRegistry(void) {
return wrap(PassRegistry::getPassRegistry());
}
/*===-- Pass Manager ------------------------------------------------------===*/
LLVMPassManagerRef LLVMCreatePassManager() {
return wrap(new legacy::PassManager());
}
LLVMPassManagerRef LLVMCreateFunctionPassManagerForModule(LLVMModuleRef M) {
return wrap(new legacy::FunctionPassManager(unwrap(M)));
}
LLVMPassManagerRef LLVMCreateFunctionPassManager(LLVMModuleProviderRef P) {
return LLVMCreateFunctionPassManagerForModule(
reinterpret_cast<LLVMModuleRef>(P));
}
LLVMBool LLVMRunPassManager(LLVMPassManagerRef PM, LLVMModuleRef M) {
return unwrap<legacy::PassManager>(PM)->run(*unwrap(M));
}
LLVMBool LLVMInitializeFunctionPassManager(LLVMPassManagerRef FPM) {
return unwrap<legacy::FunctionPassManager>(FPM)->doInitialization();
}
LLVMBool LLVMRunFunctionPassManager(LLVMPassManagerRef FPM, LLVMValueRef F) {
return unwrap<legacy::FunctionPassManager>(FPM)->run(*unwrap<Function>(F));
}
LLVMBool LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM) {
return unwrap<legacy::FunctionPassManager>(FPM)->doFinalization();
}
void LLVMDisposePassManager(LLVMPassManagerRef PM) {
delete unwrap(PM);
}
/*===-- Threading ------------------------------------------------------===*/
LLVMBool LLVMStartMultithreaded() {
return LLVMIsMultithreaded();
}
void LLVMStopMultithreaded() {
}
LLVMBool LLVMIsMultithreaded() {
return llvm_is_multithreaded();
}
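// --- Hedged usage sketch (editor's addition, not part of this revision) ---
// End-to-end use of the C API defined in this file: build a trivial
// `i32 add(i32, i32)` function and push it through the legacy function pass
// manager wrappers above. The names ("demo", "add") are illustrative only.
static void buildAndRunDemoModule() {
  LLVMContextRef Ctx = LLVMContextCreate();
  LLVMModuleRef M = LLVMModuleCreateWithNameInContext("demo", Ctx);
  LLVMTypeRef I32 = LLVMInt32TypeInContext(Ctx);
  LLVMTypeRef Params[2] = {I32, I32};
  LLVMTypeRef FnTy = LLVMFunctionType(I32, Params, 2, /*IsVarArg=*/0);
  LLVMValueRef Fn = LLVMAddFunction(M, "add", FnTy);
  LLVMBasicBlockRef Entry = LLVMAppendBasicBlockInContext(Ctx, Fn, "entry");
  LLVMBuilderRef B = LLVMCreateBuilderInContext(Ctx);
  LLVMPositionBuilderAtEnd(B, Entry);
  LLVMValueRef Sum =
      LLVMBuildAdd(B, LLVMGetParam(Fn, 0), LLVMGetParam(Fn, 1), "sum");
  LLVMBuildRet(B, Sum);
  // Run the (empty) legacy function pass manager over the new function.
  LLVMPassManagerRef FPM = LLVMCreateFunctionPassManagerForModule(M);
  LLVMInitializeFunctionPassManager(FPM);
  LLVMRunFunctionPassManager(FPM, Fn);
  LLVMFinalizeFunctionPassManager(FPM);
  LLVMDisposePassManager(FPM);
  LLVMDisposeBuilder(B);
  LLVMDisposeModule(M);
  LLVMContextDispose(Ctx);
}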
Index: projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 351722)
@@ -1,12083 +1,12091 @@
//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "AArch64ExpandImm.h"
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-lower"
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
cl::desc("Allow AArch64 SLI/SRI formation"),
cl::init(false));
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
"aarch64-elf-ldtls-generation", cl::Hidden,
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
cl::desc("Enable AArch64 logical imm instruction "
"optimization"),
cl::init(true));
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
// we have to make something up. Arbitrarily, choose ZeroOrOne.
setBooleanContents(ZeroOrOneBooleanContent);
// When comparing vectors the result sets the different elements in the
// vector to all-one or all-zero.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Set up the register classes.
addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
}
if (Subtarget->hasNEON()) {
addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
// Someone set us up the NEON.
addDRTypeForNEON(MVT::v2f32);
addDRTypeForNEON(MVT::v8i8);
addDRTypeForNEON(MVT::v4i16);
addDRTypeForNEON(MVT::v2i32);
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
addDRTypeForNEON(MVT::v4f16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
addQRTypeForNEON(MVT::v16i8);
addQRTypeForNEON(MVT::v8i16);
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
addQRTypeForNEON(MVT::v8f16);
}
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget->getRegisterInfo());
// Provide all sorts of operation actions
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::f16, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f16, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::FREM, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f80, Expand);
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// Custom lowering hooks are needed for XOR
// to fold it into CSINC/CSINV.
setOperationAction(ISD::XOR, MVT::i32, Custom);
setOperationAction(ISD::XOR, MVT::i64, Custom);
// Virtually no operations on f128 are legal, but LLVM can't expand them when
// there's a valid register class, so we need custom operations in most cases.
setOperationAction(ISD::FABS, MVT::f128, Expand);
setOperationAction(ISD::FADD, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
setOperationAction(ISD::FDIV, MVT::f128, Custom);
setOperationAction(ISD::FMA, MVT::f128, Expand);
setOperationAction(ISD::FMUL, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
setOperationAction(ISD::FRINT, MVT::f128, Expand);
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
setOperationAction(ISD::FSQRT, MVT::f128, Expand);
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
// Lowering for many of the conversions is actually specified by the non-f128
// type. The LowerXXX function will be trivial when f128 isn't involved.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// Variable-sized objects.
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
// BlockAddress
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
// Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
setOperationAction(ISD::ADDC, MVT::i32, Custom);
setOperationAction(ISD::ADDE, MVT::i32, Custom);
setOperationAction(ISD::SUBC, MVT::i32, Custom);
setOperationAction(ISD::SUBE, MVT::i32, Custom);
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
setOperationAction(ISD::SUBC, MVT::i64, Custom);
setOperationAction(ISD::SUBE, MVT::i64, Custom);
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
}
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
// Custom lower Add/Sub/Mul with overflow.
setOperationAction(ISD::SADDO, MVT::i32, Custom);
setOperationAction(ISD::SADDO, MVT::i64, Custom);
setOperationAction(ISD::UADDO, MVT::i32, Custom);
setOperationAction(ISD::UADDO, MVT::i64, Custom);
setOperationAction(ISD::SSUBO, MVT::i32, Custom);
setOperationAction(ISD::SSUBO, MVT::i64, Custom);
setOperationAction(ISD::USUBO, MVT::i32, Custom);
setOperationAction(ISD::USUBO, MVT::i64, Custom);
setOperationAction(ISD::SMULO, MVT::i32, Custom);
setOperationAction(ISD::SMULO, MVT::i64, Custom);
setOperationAction(ISD::UMULO, MVT::i32, Custom);
setOperationAction(ISD::UMULO, MVT::i64, Custom);
setOperationAction(ISD::FSIN, MVT::f32, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
if (Subtarget->hasFullFP16())
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
else
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::v4f16, Expand);
setOperationAction(ISD::FREM, MVT::v8f16, Expand);
setOperationAction(ISD::FPOW, MVT::f16, Promote);
setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP, MVT::f16, Promote);
setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP2, MVT::f16, Promote);
setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
if (!Subtarget->hasFullFP16()) {
setOperationAction(ISD::SELECT, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
setOperationAction(ISD::SETCC, MVT::f16, Promote);
setOperationAction(ISD::BR_CC, MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
setOperationAction(ISD::FMA, MVT::f16, Promote);
setOperationAction(ISD::FNEG, MVT::f16, Promote);
setOperationAction(ISD::FABS, MVT::f16, Promote);
setOperationAction(ISD::FCEIL, MVT::f16, Promote);
setOperationAction(ISD::FSQRT, MVT::f16, Promote);
setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
setOperationAction(ISD::FRINT, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Promote);
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
// Promote v4f16 to v4f32 when that is known to be safe.
setOperationAction(ISD::FADD, MVT::v4f16, Promote);
setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
setOperationAction(ISD::FABS, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
setOperationAction(ISD::FMA, MVT::v4f16, Expand);
setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
setOperationAction(ISD::FABS, MVT::v8f16, Expand);
setOperationAction(ISD::FADD, MVT::v8f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
setOperationAction(ISD::FMA, MVT::v8f16, Expand);
setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
}
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::f32, MVT::f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
setOperationAction(ISD::FMINNUM, Ty, Legal);
setOperationAction(ISD::FMAXNUM, Ty, Legal);
setOperationAction(ISD::FMINIMUM, Ty, Legal);
setOperationAction(ISD::FMAXIMUM, Ty, Legal);
setOperationAction(ISD::LROUND, Ty, Legal);
setOperationAction(ISD::LLROUND, Ty, Legal);
setOperationAction(ISD::LRINT, Ty, Legal);
setOperationAction(ISD::LLRINT, Ty, Legal);
}
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
setOperationAction(ISD::FCEIL, MVT::f16, Legal);
setOperationAction(ISD::FRINT, MVT::f16, Legal);
setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
}
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
// Issue __sincos_stret if available.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
} else {
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
// Make floating-point constants legal for the large code model, so they don't
// become loads from the constant pool.
if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
}
// AArch64 does not have floating-point extending loads, i1 sign-extending
// loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
for (MVT VT : MVT::fp_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
}
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, MVT::i8, Legal);
setIndexedLoadAction(im, MVT::i16, Legal);
setIndexedLoadAction(im, MVT::i32, Legal);
setIndexedLoadAction(im, MVT::i64, Legal);
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
setIndexedLoadAction(im, MVT::f16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
setIndexedStoreAction(im, MVT::i64, Legal);
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
setIndexedStoreAction(im, MVT::f16, Legal);
}
// Trap.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// We combine OR nodes for bitfield operations.
setTargetDAGCombine(ISD::OR);
// Try to create BICs for vector ANDs.
setTargetDAGCombine(ISD::AND);
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV.
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
if (Subtarget->supportsAddressTopByteIgnored())
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::GlobalAddress);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemset = Subtarget->requiresStrictAlign()
? MaxStoresPerMemsetOptSize : 32;
MaxGluedStoresPerMemcpy = 4;
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
? MaxStoresPerMemcpyOptSize : 16;
MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
MaxLoadsPerMemcmpOptSize = 4;
MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
? MaxLoadsPerMemcmpOptSize : 8;
setStackPointerRegisterToSaveRestore(AArch64::SP);
setSchedulingPreference(Sched::Hybrid);
EnableExtLdPromotion = true;
// Set required alignment.
setMinFunctionAlignment(2);
// Set preferred alignments.
setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
setPrefLoopAlignment(STI.getPrefLoopAlignment());
// Only change the limit for entries in a jump table if specified by
// the subtarget, but not at the command line.
unsigned MaxJT = STI.getMaximumJumpTableSize();
if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
setMaximumJumpTableSize(MaxJT);
setHasExtractBitsInsn(true);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
setOperationAction(ISD::FABS, MVT::v1f64, Expand);
setOperationAction(ISD::FADD, MVT::v1f64, Expand);
setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
setOperationAction(ISD::FMA, MVT::v1f64, Expand);
setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
setOperationAction(ISD::FREM, MVT::v1f64, Expand);
setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
// AArch64 doesn't have direct vector->f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
// i8 vector elements also need promotion to i32 for v8i8
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
// Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
// conversion happens in two steps: v4i32 -> v4f32 -> v4f16
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
} else {
// When AArch64 doesn't have full fp16 support, promote the input
// to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
}
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
// Vector reductions
for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
}
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
}
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
setOperationAction(ISD::MULHS, VT, Legal);
setOperationAction(ISD::MULHU, VT, Legal);
} else {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
}
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
if (Subtarget->hasFullFP16()) {
for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
}
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
assert(VT.isVector() && "VT should be a vector type");
if (VT.isFloatingPoint()) {
MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
}
// Mark vector float intrinsics as expand.
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
// But we do support custom-lowering for FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
for (MVT InnerVT : MVT::all_valuetypes())
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// CNT supports only B element sizes; use UADDLP to widen to larger elements.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
if (!VT.isFloatingPoint())
setOperationAction(ISD::ABS, VT, Legal);
// [SU][MIN|MAX] are available for all NEON types apart from i64.
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
// F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
if (VT.isFloatingPoint() &&
(VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
for (unsigned Opcode :
{ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
setOperationAction(Opcode, VT, Legal);
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
}
}
}
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);
addTypeForNEON(VT, MVT::v2i32);
}
void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR128RegClass);
addTypeForNEON(VT, MVT::v4i32);
}
EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
const APInt &Demanded,
TargetLowering::TargetLoweringOpt &TLO,
unsigned NewOpc) {
uint64_t OldImm = Imm, NewImm, Enc;
uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
// Return if the immediate is already all zeros, all ones, a bimm32 or a
// bimm64.
if (Imm == 0 || Imm == Mask ||
AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
return false;
unsigned EltSize = Size;
uint64_t DemandedBits = Demanded.getZExtValue();
// Clear bits that are not demanded.
Imm &= DemandedBits;
while (true) {
// The goal here is to set the non-demanded bits in a way that minimizes
// the number of transitions between 0 and 1. To achieve this, we set each
// non-demanded bit to the value of the preceding demanded bit.
// For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
// non-demanded bit), we copy bit0 (1) to the least significant 'x',
// bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
// The final result is 0b11000011.
uint64_t NonDemandedBits = ~DemandedBits;
uint64_t InvertedImm = ~Imm & DemandedBits;
uint64_t RotatedImm =
((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
NonDemandedBits;
uint64_t Sum = RotatedImm + NonDemandedBits;
bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
uint64_t Ones = (Sum + Carry) & NonDemandedBits;
NewImm = (Imm | Ones) & Mask;
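// Tracing the 8-bit example from the comment above (with EltSize == 8 for
// illustration): DemandedBits == 0b01100101, Imm == 0b01000001, so
// NonDemandedBits == 0b10011010, InvertedImm == 0b00100100, RotatedImm ==
// 0b00001000 (the non-demanded bits whose preceding demanded bit is 0),
// Sum == 0b10100010, Ones == 0b10000010, and NewImm == 0b11000011.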
// If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
// or all-ones or all-zeros, in which case we can stop searching. Otherwise,
// we halve the element size and continue the search.
if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
break;
// We cannot shrink the element size any further if it is 2-bits.
if (EltSize == 2)
return false;
EltSize /= 2;
Mask >>= EltSize;
uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
// Return if there is a mismatch in any of the demanded bits of Imm and Hi.
if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
return false;
// Merge the upper and lower halves of Imm and DemandedBits.
Imm |= Hi;
DemandedBits |= DemandedBitsHi;
}
++NumOptimizedImms;
// Replicate the element across the register width.
while (EltSize < Size) {
NewImm |= NewImm << EltSize;
EltSize *= 2;
}
(void)OldImm;
assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
"demanded bits should never be altered");
assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
// Create the new constant immediate node.
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue New;
// If the new constant immediate is all-zeros or all-ones, let the target
// independent DAG combine optimize this node.
if (NewImm == 0 || NewImm == OrigMask) {
New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
TLO.DAG.getConstant(NewImm, DL, VT));
// Otherwise, create a machine node so that target independent DAG combine
// doesn't undo this optimization.
} else {
Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
New = SDValue(
TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
}
return TLO.CombineTo(Op, New);
}
bool AArch64TargetLowering::targetShrinkDemandedConstant(
SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
// Delay this optimization to as late as possible.
if (!TLO.LegalOps)
return false;
if (!EnableOptimizeLogicalImm)
return false;
EVT VT = Op.getValueType();
if (VT.isVector())
return false;
unsigned Size = VT.getSizeInBits();
assert((Size == 32 || Size == 64) &&
"i32 or i64 is expected after legalization.");
// Exit early if we demand all bits.
if (Demanded.countPopulation() == Size)
return false;
unsigned NewOpc;
switch (Op.getOpcode()) {
default:
return false;
case ISD::AND:
NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
break;
case ISD::OR:
NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
break;
case ISD::XOR:
NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
break;
}
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
return false;
uint64_t Imm = C->getZExtValue();
return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
const SDValue Op, KnownBits &Known,
const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
case AArch64ISD::CSEL: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
break;
}
case ISD::INTRINSIC_W_CHAIN: {
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default: return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
unsigned BitWidth = Known.getBitWidth();
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
}
break;
}
case ISD::INTRINSIC_WO_CHAIN:
case ISD::INTRINSIC_VOID: {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
default:
break;
case Intrinsic::aarch64_neon_umaxv:
case Intrinsic::aarch64_neon_uminv: {
// Figure out the datatype of the vector operand. The UMINV instruction
// will zero extend the result, so we can mark as known zero all the
// bits larger than the element datatype. 32-bit or larger element types
// don't need this, as those are legal types and will be handled by isel
// directly.
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
assert(BitWidth >= 8 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
Known.Zero |= Mask;
} else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
assert(BitWidth >= 16 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
Known.Zero |= Mask;
}
break;
}
}
}
}
}
MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
EVT) const {
return MVT::i64;
}
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
if (Fast) {
// Some CPUs are fine with unaligned stores except for 128-bit ones.
*Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
Align <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
VT == MVT::v2i64;
}
return true;
}
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return AArch64::createFastISel(funcInfo, libInfo);
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AArch64ISD::NodeType)Opcode) {
case AArch64ISD::FIRST_NUMBER: break;
case AArch64ISD::CALL: return "AArch64ISD::CALL";
case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
case AArch64ISD::ADR: return "AArch64ISD::ADR";
case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
case AArch64ISD::ADC: return "AArch64ISD::ADC";
case AArch64ISD::SBC: return "AArch64ISD::SBC";
case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
case AArch64ISD::DUP: return "AArch64ISD::DUP";
case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
case AArch64ISD::BICi: return "AArch64ISD::BICi";
case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
case AArch64ISD::BSL: return "AArch64ISD::BSL";
case AArch64ISD::NEG: return "AArch64ISD::NEG";
case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
case AArch64ISD::REV16: return "AArch64ISD::REV16";
case AArch64ISD::REV32: return "AArch64ISD::REV32";
case AArch64ISD::REV64: return "AArch64ISD::REV64";
case AArch64ISD::EXT: return "AArch64ISD::EXT";
case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
case AArch64ISD::NOT: return "AArch64ISD::NOT";
case AArch64ISD::BIT: return "AArch64ISD::BIT";
case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
case AArch64ISD::STG: return "AArch64ISD::STG";
case AArch64ISD::STZG: return "AArch64ISD::STZG";
case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
}
return nullptr;
}
MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
// OrigBB:
// [... previous instrs leading to comparison ...]
// b.ne TrueBB
// b EndBB
// TrueBB:
// ; Fallthrough
// EndBB:
// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator It = ++MBB->getIterator();
unsigned DestReg = MI.getOperand(0).getReg();
unsigned IfTrueReg = MI.getOperand(1).getReg();
unsigned IfFalseReg = MI.getOperand(2).getReg();
unsigned CondCode = MI.getOperand(3).getImm();
bool NZCVKilled = MI.getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(It, TrueBB);
MF->insert(It, EndBB);
// Transfer rest of current basic-block to EndBB
EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
MBB->end());
EndBB->transferSuccessorsAndUpdatePHIs(MBB);
BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
MBB->addSuccessor(TrueBB);
MBB->addSuccessor(EndBB);
// TrueBB falls through to the end.
TrueBB->addSuccessor(EndBB);
if (!NZCVKilled) {
TrueBB->addLiveIn(AArch64::NZCV);
EndBB->addLiveIn(AArch64::NZCV);
}
BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
.addReg(IfTrueReg)
.addMBB(TrueBB)
.addReg(IfFalseReg)
.addMBB(MBB);
MI.eraseFromParent();
return EndBB;
}
MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
MachineInstr &MI, MachineBasicBlock *BB) const {
assert(!isAsynchronousEHPersonality(classifyEHPersonality(
BB->getParent()->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
return BB;
}
MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
MachineInstr &MI, MachineBasicBlock *BB) const {
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
default:
#ifndef NDEBUG
MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case AArch64::CATCHPAD:
return EmitLoweredCatchPad(MI, BB);
}
}
//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
switch (CC) {
default:
llvm_unreachable("Unknown condition code!");
case ISD::SETNE:
return AArch64CC::NE;
case ISD::SETEQ:
return AArch64CC::EQ;
case ISD::SETGT:
return AArch64CC::GT;
case ISD::SETGE:
return AArch64CC::GE;
case ISD::SETLT:
return AArch64CC::LT;
case ISD::SETLE:
return AArch64CC::LE;
case ISD::SETUGT:
return AArch64CC::HI;
case ISD::SETUGE:
return AArch64CC::HS;
case ISD::SETULT:
return AArch64CC::LO;
case ISD::SETULE:
return AArch64CC::LS;
}
}
/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
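/// For the ordered-only conditions the mapping avoids the signed integer
/// codes: e.g. SETOLT maps to MI rather than LT because an unordered FCMP
/// result sets NZCV to 0b0011, for which MI (N == 1) is false but LT (N != V)
/// would wrongly hold.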
static void changeFPCCToAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (CC) {
default:
llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
case ISD::SETOEQ:
CondCode = AArch64CC::EQ;
break;
case ISD::SETGT:
case ISD::SETOGT:
CondCode = AArch64CC::GT;
break;
case ISD::SETGE:
case ISD::SETOGE:
CondCode = AArch64CC::GE;
break;
case ISD::SETOLT:
CondCode = AArch64CC::MI;
break;
case ISD::SETOLE:
CondCode = AArch64CC::LS;
break;
case ISD::SETONE:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GT;
break;
case ISD::SETO:
CondCode = AArch64CC::VC;
break;
case ISD::SETUO:
CondCode = AArch64CC::VS;
break;
case ISD::SETUEQ:
CondCode = AArch64CC::EQ;
CondCode2 = AArch64CC::VS;
break;
case ISD::SETUGT:
CondCode = AArch64CC::HI;
break;
case ISD::SETUGE:
CondCode = AArch64CC::PL;
break;
case ISD::SETLT:
case ISD::SETULT:
CondCode = AArch64CC::LT;
break;
case ISD::SETLE:
case ISD::SETULE:
CondCode = AArch64CC::LE;
break;
case ISD::SETNE:
case ISD::SETUNE:
CondCode = AArch64CC::NE;
break;
}
}
/// Convert a DAG fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (CC) {
default:
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
assert(CondCode2 == AArch64CC::AL);
break;
case ISD::SETONE:
// (a one b)
// == ((a olt b) || (a ogt b))
// == ((a ord b) && (a une b))
CondCode = AArch64CC::VC;
CondCode2 = AArch64CC::NE;
break;
case ISD::SETUEQ:
// (a ueq b)
// == ((a uno b) || (a oeq b))
// == ((a ule b) && (a uge b))
CondCode = AArch64CC::PL;
CondCode2 = AArch64CC::LE;
break;
}
}
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2,
bool &Invert) {
Invert = false;
switch (CC) {
default:
// Mostly the scalar mappings work fine.
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
break;
case ISD::SETUO:
Invert = true;
LLVM_FALLTHROUGH;
case ISD::SETO:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GE;
break;
case ISD::SETUEQ:
case ISD::SETULT:
case ISD::SETULE:
case ISD::SETUGT:
case ISD::SETUGE:
// All of the compare-mask comparisons are ordered, but we can switch
// between the two by a double inversion. E.g. ULE == !OGT.
Invert = true;
changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
break;
}
}
static bool isLegalArithImmed(uint64_t C) {
// Matches AArch64DAGToDAGISel::SelectArithImmed().
bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
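// For example, 0xFFF (a plain 12-bit value) and 0xABC000 (a 12-bit value
// shifted left by 12) are legal here, while 0x1001 is not.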
LLVM_DEBUG(dbgs() << "Is imm " << C
<< " legal: " << (IsLegal ? "yes\n" : "no\n"));
return IsLegal;
}
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal
// then everything is fine; if not, the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
//
// So, finally, the only LLVM-native comparisons that don't mention C and V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
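// For example, a SETEQ of op1 against (sub 0, op2) can be emitted as
// (ADDS op1, op2), i.e. a CMN, instead of materializing the negation and
// comparing with SUBS.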
static bool isCMN(SDValue Op, ISD::CondCode CC) {
return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE);
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
if (VT.isFloatingPoint()) {
assert(VT != MVT::f128);
if (VT == MVT::f16 && !FullFP16) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
VT = MVT::f32;
}
return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
}
// The CMP instruction is just an alias for SUBS, and representing it as
// SUBS means that it's possible to get CSE with subtract operations.
// A later phase can perform the optimization of setting the destination
// register to WZR/XZR if it ends up being unused.
unsigned Opcode = AArch64ISD::SUBS;
if (isCMN(RHS, CC)) {
// Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
Opcode = AArch64ISD::ADDS;
RHS = RHS.getOperand(1);
} else if (isCMN(LHS, CC)) {
// As we are looking for EQ/NE compares, the operands can be commuted; can
// we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
Opcode = AArch64ISD::ADDS;
LHS = LHS.getOperand(1);
} else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
!isUnsignedIntSetCC(CC)) {
// Similarly, (CMP (and X, Y), 0) can be implemented with a TST
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
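// For example, (seteq (and x, y), 0) is emitted as ANDS ("tst x, y") plus a
// test of the Z flag, with no explicit compare against zero.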
Opcode = AArch64ISD::ANDS;
RHS = LHS.getOperand(1);
LHS = LHS.getOperand(0);
}
return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
.getValue(1);
}
/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This makes it possible to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
/// cmp A
/// ccmp B, inv(CB), CA
/// check for CB flags
///
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
/// - We can implement (NEG SETCC) i.e. negating a single comparison by
/// negating the flags used in the CCMP/FCCMP operation.
/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
/// by negating the flags we test for afterwards. i.e.
/// NEG (CMP CCMP CCCMP ...) can be implemented.
/// - Note that we can only ever negate all previously processed results.
/// What we cannot implement by flipping the flags to test is a negation
/// of two sub-trees (because the negation affects all sub-trees emitted so
/// far, so the 2nd sub-tree we emit would also affect the first).
/// With those tools we can implement some OR operations:
/// - (OR (SETCC A) (SETCC B)) can be implemented via:
/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
/// - After transforming OR to NEG/AND combinations we may be able to use NEG
/// elimination rules from earlier to implement the whole thing as a
/// CCMP/FCCMP chain.
///
/// As a complete example:
/// or (or (setCA (cmp A)) (setCB (cmp B)))
/// (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
/// or (and (setCC (cmp C)) (setCD (cmp D)))
/// (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
/// which can be implemented as:
/// cmp C
/// ccmp D, inv(CD), CC
/// ccmp A, CA, inv(CD)
/// ccmp B, CB, inv(CA)
/// check for CB flags
///
/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
/// can implement only one of the inner (not) operations, not both!
/// @{
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
AArch64CC::CondCode Predicate,
AArch64CC::CondCode OutCC,
const SDLoc &DL, SelectionDAG &DAG) {
unsigned Opcode = 0;
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
if (LHS.getValueType().isFloatingPoint()) {
assert(LHS.getValueType() != MVT::f128);
if (LHS.getValueType() == MVT::f16 && !FullFP16) {
LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
}
Opcode = AArch64ISD::FCCMP;
} else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// See emitComparison() on why we can only do this for SETEQ and SETNE.
Opcode = AArch64ISD::CCMN;
RHS = RHS.getOperand(1);
}
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
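// The NZCV immediate supplies the flag values used when Predicate is false;
// it is chosen to satisfy the inverse of OutCC, so a false predicate makes
// the subsequent OutCC test fail.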
return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
/// expressed as a conjunction. See \ref AArch64CCMP.
/// \param CanNegate Set to true if we can negate the whole sub-tree just by
/// changing the conditions on the SETCC tests.
/// (this means we can call emitConjunctionRec() with
/// Negate==true on this sub-tree)
/// \param MustBeFirst Set to true if this subtree needs to be negated and we
/// cannot do the negation naturally. We are required to
/// emit the subtree first in this case.
/// \param WillNegate Is true if we are called when the result of this
/// subexpression must be negated. This happens when the
/// outer expression is an OR. We can use this fact to know
/// that we have a double negation (or (or ...) ...) that
/// can be implemented for free.
static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
bool &MustBeFirst, bool WillNegate,
unsigned Depth = 0) {
if (!Val.hasOneUse())
return false;
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
if (Val->getOperand(0).getValueType() == MVT::f128)
return false;
CanNegate = true;
MustBeFirst = false;
return true;
}
// Protect against exponential runtime and stack overflow.
if (Depth > 6)
return false;
if (Opcode == ISD::AND || Opcode == ISD::OR) {
bool IsOR = Opcode == ISD::OR;
SDValue O0 = Val->getOperand(0);
SDValue O1 = Val->getOperand(1);
bool CanNegateL;
bool MustBeFirstL;
if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
return false;
bool CanNegateR;
bool MustBeFirstR;
if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
return false;
if (MustBeFirstL && MustBeFirstR)
return false;
if (IsOR) {
// For an OR expression we need to be able to naturally negate at least
// one side or we cannot do the transformation at all.
if (!CanNegateL && !CanNegateR)
return false;
// If the result of the OR will be negated and we can naturally negate
// the leaves, then this sub-tree as a whole negates naturally.
CanNegate = WillNegate && CanNegateL && CanNegateR;
// If we cannot naturally negate the whole sub-tree, then this must be
// emitted first.
MustBeFirst = !CanNegate;
} else {
assert(Opcode == ISD::AND && "Must be OR or AND");
// We cannot naturally negate an AND operation.
CanNegate = false;
MustBeFirst = MustBeFirstL || MustBeFirstR;
}
return true;
}
return false;
}
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
/// SETCC conditions.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
if (Negate)
CC = getSetCCInverse(CC, isInteger);
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
if (isInteger) {
OutCC = changeIntCCToAArch64CC(CC);
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
// Some floating point conditions can't be tested with a single condition
// code. Construct an additional comparison in this case.
if (ExtraCC != AArch64CC::AL) {
SDValue ExtraCmp;
if (!CCOp.getNode())
ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
else
ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
ExtraCC, DL, DAG);
CCOp = ExtraCmp;
Predicate = ExtraCC;
}
}
// Produce a normal comparison if we are first in the chain
if (!CCOp)
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
DAG);
}
assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
bool IsOR = Opcode == ISD::OR;
SDValue LHS = Val->getOperand(0);
bool CanNegateL;
bool MustBeFirstL;
bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
assert(ValidL && "Valid conjunction/disjunction tree");
(void)ValidL;
SDValue RHS = Val->getOperand(1);
bool CanNegateR;
bool MustBeFirstR;
bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
assert(ValidR && "Valid conjunction/disjunction tree");
(void)ValidR;
// Swap sub-tree that must come first to the right side.
if (MustBeFirstL) {
assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
std::swap(LHS, RHS);
std::swap(CanNegateL, CanNegateR);
std::swap(MustBeFirstL, MustBeFirstR);
}
bool NegateR;
bool NegateAfterR;
bool NegateL;
bool NegateAfterAll;
if (Opcode == ISD::OR) {
// Swap the sub-tree that we can negate naturally to the left.
if (!CanNegateL) {
assert(CanNegateR && "at least one side must be negatable");
assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
assert(!Negate);
std::swap(LHS, RHS);
NegateR = false;
NegateAfterR = true;
} else {
// Negate the left sub-tree if possible, otherwise negate the result.
NegateR = CanNegateR;
NegateAfterR = !CanNegateR;
}
NegateL = true;
NegateAfterAll = !Negate;
} else {
assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
assert(!Negate && "Valid conjunction/disjunction tree");
NegateL = false;
NegateR = false;
NegateAfterR = false;
NegateAfterAll = false;
}
// Emit sub-trees.
AArch64CC::CondCode RHSCC;
SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
if (NegateAfterR)
RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
if (NegateAfterAll)
OutCC = AArch64CC::getInvertedCondCode(OutCC);
return CmpL;
}
/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
/// In some cases this is even possible with OR operations in the expression.
/// See \ref AArch64CCMP.
/// \see emitConjunctionRec().
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC) {
bool DummyCanNegate;
bool DummyMustBeFirst;
if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
return SDValue();
return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
}
/// @}
/// Returns how profitable it is to fold a comparison's operand's shift and/or
/// extension operations.
static unsigned getCmpOperandFoldingProfit(SDValue Op) {
auto isSupportedExtend = [&](SDValue V) {
if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
return true;
if (V.getOpcode() == ISD::AND)
if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
uint64_t Mask = MaskCst->getZExtValue();
return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
}
return false;
};
if (!Op.hasOneUse())
return 0;
if (isSupportedExtend(Op))
return 1;
unsigned Opc = Op.getOpcode();
if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
uint64_t Shift = ShiftCst->getZExtValue();
if (isSupportedExtend(Op.getOperand(0)))
return (Shift <= 4) ? 2 : 1;
EVT VT = Op.getValueType();
if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
return 1;
}
return 0;
}
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG,
const SDLoc &dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
if (!isLegalArithImmed(C)) {
// Constant does not fit, try adjusting it by one?
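// For example, (setlt x, 0x1001) cannot encode 0x1001 directly, but it is
// equivalent to (setle x, 0x1000), and 0x1000 is encodable as "#1, lsl #12".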
switch (CC) {
default:
break;
case ISD::SETLT:
case ISD::SETGE:
if ((VT == MVT::i32 && C != 0x80000000 &&
isLegalArithImmed((uint32_t)(C - 1))) ||
(VT == MVT::i64 && C != 0x80000000ULL &&
isLegalArithImmed(C - 1ULL))) {
CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETULT:
case ISD::SETUGE:
if ((VT == MVT::i32 && C != 0 &&
isLegalArithImmed((uint32_t)(C - 1))) ||
(VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETLE:
case ISD::SETGT:
if ((VT == MVT::i32 && C != INT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
(VT == MVT::i64 && C != INT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETULE:
case ISD::SETUGT:
if ((VT == MVT::i32 && C != UINT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
(VT == MVT::i64 && C != UINT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
}
}
}
// Comparisons are canonicalized so that the RHS operand is simpler than the
// LHS one, the extreme case being when RHS is an immediate. However, AArch64
// can fold some shift+extend operations on the RHS operand, so swap the
// operands if that can be done.
//
// For example:
// lsl w13, w11, #1
// cmp w13, w12
// can be turned into:
// cmp w12, w11, lsl #1
if (!isa<ConstantSDNode>(RHS) ||
!isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
}
}
SDValue Cmp;
AArch64CC::CondCode AArch64CC;
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
// The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
// For the i8 operand, the largest immediate is 255, so this can be easily
// encoded in the compare instruction. For the i16 operand, however, the
// largest immediate cannot be encoded in the compare.
// Therefore, use a sign extending load and cmn to avoid materializing the
// -1 constant. For example,
// movz w1, #65535
// ldrh w0, [x0, #0]
// cmp w0, w1
// >
// ldrsh w0, [x0, #0]
// cmn w0, #1
// Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
// if and only if (sext LHS) == (sext RHS). The checks are in place to
// ensure both the LHS and RHS are truly zero extended and to make sure the
// transformation is profitable.
if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
LHS.getNode()->hasNUsesOfValue(1, 0)) {
int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
SDValue SExt =
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
DAG.getValueType(MVT::i16));
Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
RHS.getValueType()),
CC, dl, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
}
}
if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
}
}
}
if (!Cmp) {
Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
}
AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
return Cmp;
}
static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
"Unsupported value type");
SDValue Value, Overflow;
SDLoc DL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
unsigned Opc = 0;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unknown overflow instruction!");
case ISD::SADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::VS;
break;
case ISD::UADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::HS;
break;
case ISD::SSUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::VS;
break;
case ISD::USUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::LO;
break;
// Multiply needs a little bit extra work.
case ISD::SMULO:
case ISD::UMULO: {
CC = AArch64CC::NE;
bool IsSigned = Op.getOpcode() == ISD::SMULO;
if (Op.getValueType() == MVT::i32) {
unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
// For a 32 bit multiply with overflow check we want the instruction
// selector to generate a widening multiply (SMADDL/UMADDL). For that we
// need to generate the following pattern:
// (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
DAG.getConstant(0, DL, MVT::i64));
// On AArch64 the upper 32 bits are always zero extended for a 32 bit
// operation. We need to clear out the upper 32 bits, because we used a
// widening multiply that wrote all 64 bits. In the end this should be a
// noop.
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
if (IsSigned) {
// The signed overflow check requires more than just a simple check for
// any bit set in the upper 32 bits of the result. These bits could be
// just the sign bits of a negative number. To perform the overflow
// check we arithmetic-shift-right the low 32 bits of the result by 31
// (replicating the sign bit) and compare that with the upper 32 bits.
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
DAG.getConstant(32, DL, MVT::i64));
UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
DAG.getConstant(31, DL, MVT::i64));
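// Overflow iff UpperBits != LowerBits, i.e. the high 32 bits are not simply
// the sign extension of the low 32 bits. For instance 0x60000000 * 4 gives
// 0x180000000: UpperBits == 1 but LowerBits == -1, so overflow is reported.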
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
// The overflow check for unsigned multiply is easy. We only need to
// check if any of the upper 32 bits are set. This can be done with a
// CMP (shifted register). For that we need to generate the following
// pattern:
// (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
DAG.getConstant(32, DL, MVT::i64));
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
UpperBits).getValue(1);
}
break;
}
assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
// For the 64 bit multiply, compute the full product and check whether the
// high half is just the sign extension of the low half (signed) or zero
// (unsigned); if not, overflow occurred.
Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
if (IsSigned) {
SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
UpperBits).getValue(1);
}
break;
}
} // switch (...)
if (Opc) {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
Overflow = Value.getValue(1);
}
return std::make_pair(Value, Overflow);
}
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
}
// Returns true if the given Op is the overflow flag result of an overflow
// intrinsic operation.
static bool isOverflowIntrOpRes(SDValue Op) {
unsigned Opc = Op.getOpcode();
return (Op.getResNo() == 1 &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
}
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
SDValue Sel = Op.getOperand(0);
SDValue Other = Op.getOperand(1);
SDLoc dl(Sel);
// If the operand is an overflow checking operation, invert the condition
// code and kill the Not operation. I.e., transform:
// (xor (overflow_op_bool, 1))
// -->
// (csel 1, 0, invert(cc), overflow_op_bool)
// ... which later gets transformed to just a cset instruction with an
// inverted condition code, rather than a cset + eor sequence.
if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
return SDValue();
SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
AArch64CC::CondCode CC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
}
// If neither operand is a SELECT_CC, give up.
if (Sel.getOpcode() != ISD::SELECT_CC)
std::swap(Sel, Other);
if (Sel.getOpcode() != ISD::SELECT_CC)
return Op;
// The folding we want to perform is:
// (xor x, (select_cc a, b, cc, 0, -1) )
// -->
// (csel x, (xor x, -1), cc ...)
//
// The latter will get matched to a CSINV instruction.
ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
SDValue LHS = Sel.getOperand(0);
SDValue RHS = Sel.getOperand(1);
SDValue TVal = Sel.getOperand(2);
SDValue FVal = Sel.getOperand(3);
// FIXME: This could be generalized to non-integer comparisons.
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return Op;
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
// The values aren't constants, this isn't the pattern we're looking for.
if (!CFVal || !CTVal)
return Op;
// We can commute the SELECT_CC by inverting the condition. This
// might be needed to make this fit into a CSINV pattern.
if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
}
// If the constants line up, perform the transform!
if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
FVal = Other;
TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
DAG.getConstant(-1ULL, dl, Other.getValueType()));
return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
CCVal, Cmp);
}
return Op;
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
unsigned Opc;
bool ExtraOp = false;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Invalid code");
case ISD::ADDC:
Opc = AArch64ISD::ADDS;
break;
case ISD::SUBC:
Opc = AArch64ISD::SUBS;
break;
case ISD::ADDE:
Opc = AArch64ISD::ADCS;
ExtraOp = true;
break;
case ISD::SUBE:
Opc = AArch64ISD::SBCS;
ExtraOp = true;
break;
}
if (!ExtraOp)
return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
Op.getOperand(2));
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
return SDValue();
SDLoc dl(Op);
AArch64CC::CondCode CC;
// The actual operation that sets the overflow or carry flag.
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
// We use 0 and 1 as false and true values.
SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
// We use an inverted condition, because the conditional select is inverted
// too. This will allow it to be selected to a single instruction:
// CSINC Wd, WZR, WZR, invert(cond).
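// (CSINC Wd, WZR, WZR, invert(cond) is exactly what the "cset Wd, cond"
// alias expands to.)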
SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
CCVal, Overflow);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
bool IsStream = !Locality;
// When the locality number is set
if (Locality) {
// The front-end should have filtered out the out-of-range values
assert(Locality <= 3 && "Prefetch locality out-of-range");
// The locality degree is the opposite of the cache level: the higher the
// locality, the closer (and faster) the targeted cache. The prfop encoding
// starts at 0 for level 1, so flip the number.
Locality = 3 - Locality;
}
// Build the mask value encoding the expected behavior.
unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
(!IsData << 3) | // IsDataCache bit
(Locality << 1) | // Cache level bits
(unsigned)IsStream; // Stream bit
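// For example, a read prefetch of the data cache with locality 3 encodes as
// prfop 0b00000 (PLDL1KEEP), while locality 0 sets the stream bit and yields
// PLDL1STRM.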
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
}
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
RTLIB::Libcall LC;
LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getOperand(0).getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDValue SrcVal = Op.getOperand(0);
return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
SDLoc(Op)).first;
}
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
EVT InVT = Op.getOperand(0).getValueType();
EVT VT = Op.getValueType();
unsigned NumElts = InVT.getVectorNumElements();
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (InVT.getVectorElementType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
}
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
SDLoc dl(Op);
SDValue Cv =
DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
Op.getOperand(0));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
}
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
SDLoc dl(Op);
MVT ExtVT =
MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
VT.getVectorNumElements());
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
}
// Type changing conversions are illegal.
return Op;
}
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getOperand(0).getValueType().isVector())
return LowerVectorFP_TO_INT(Op, DAG);
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (Op.getOperand(0).getValueType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
}
if (Op.getOperand(0).getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::FP_TO_SINT)
LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
else
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
EVT VT = Op.getValueType();
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
EVT InVT = In.getValueType();
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
MVT CastVT =
MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
InVT.getVectorNumElements());
In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
unsigned CastOpc =
Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
EVT CastVT = VT.changeVectorElementTypeToInteger();
In = DAG.getNode(CastOpc, dl, CastVT, In);
return DAG.getNode(Op.getOpcode(), dl, VT, In);
}
return Op;
}
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (Op.getValueType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
SDLoc dl(Op);
return DAG.getNode(
ISD::FP_ROUND, dl, MVT::f16,
DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
DAG.getIntPtrConstant(0, dl));
}
// i128 conversions are libcalls.
if (Op.getOperand(0).getValueType() == MVT::i128)
return SDValue();
// Other conversions are legal, unless it's to the completely software-based
// fp128.
if (Op.getValueType() != MVT::f128)
return Op;
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::SINT_TO_FP)
LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
else
LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SelectionDAG &DAG) const {
// For iOS, we want to call an alternative entry point: __sincos_stret,
// which returns the values in two S / D registers.
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
: RTLIB::SINCOS_STRET_F32;
const char *LibcallName = getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
StructType *RetTy = StructType::get(ArgTy, ArgTy);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
.setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
}
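// Lower an i16 -> f16 bitcast: there is no direct 16-bit GPR-to-FPR move, so
// extend the integer to i32, bitcast it to f32 and extract the 'hsub'
// subregister to obtain the f16 value.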
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
if (Op.getValueType() != MVT::f16)
return SDValue();
assert(Op.getOperand(0).getValueType() == MVT::i16);
SDLoc DL(Op);
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
return SDValue(
DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
}
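// Return a 64-bit-or-wider vector type with the same element count as OrigVT,
// widening the elements as needed; used to prepare narrow operands for the
// SMULL/UMULL patterns below.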
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
if (OrigVT.getSizeInBits() >= 64)
return OrigVT;
assert(OrigVT.isSimple() && "Expecting a simple value type");
MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
switch (OrigSimpleTy) {
default: llvm_unreachable("Unexpected Vector Type");
case MVT::v2i8:
case MVT::v2i16:
return MVT::v2i32;
case MVT::v4i8:
return MVT::v4i16;
}
}
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
const EVT &OrigTy,
const EVT &ExtTy,
unsigned ExtOpcode) {
// The vector originally had a size of OrigTy. It was then extended to ExtTy.
// We expect the ExtTy to be 128-bits total. If the OrigTy is less than
// 64-bits we need to insert a new extension so that it will be 64-bits.
assert(ExtTy.is128BitVector() && "Unexpected extension size");
if (OrigTy.getSizeInBits() >= 64)
return N;
// Must extend size to at least 64 bits to be used as an operand for VMULL.
EVT NewVT = getExtensionTo64Bits(OrigTy);
return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
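// Returns true if N is a BUILD_VECTOR whose elements are all constants that
// fit in half of the vector's element width (signed or unsigned as requested),
// i.e. the node behaves like an implicitly extended narrower vector.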
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
bool isSigned) {
EVT VT = N->getValueType(0);
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Elt : N->op_values()) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfSize = EltSize / 2;
if (isSigned) {
if (!isIntN(HalfSize, C->getSExtValue()))
return false;
} else {
if (!isUIntN(HalfSize, C->getZExtValue()))
return false;
}
continue;
}
return false;
}
return true;
}
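// Strip the extension from a SMULL/UMULL operand: for an explicit sign/zero
// extend return its source (re-extended to 64 bits if narrower), and for a
// constant BUILD_VECTOR rebuild the constants as a vector with half-width
// elements.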
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
N->getOperand(0)->getValueType(0),
N->getValueType(0),
N->getOpcode());
assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
EVT VT = N->getValueType(0);
SDLoc dl(N);
unsigned EltSize = VT.getScalarSizeInBits() / 2;
unsigned NumElts = VT.getVectorNumElements();
MVT TruncVT = MVT::getIntegerVT(EltSize);
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i != NumElts; ++i) {
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
const APInt &CInt = C->getAPIntValue();
// Element types smaller than 32 bits are not legal, so use i32 elements.
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
}
return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::SIGN_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, true);
}
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::ZERO_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, false);
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDNode *N0 = N->getOperand(0).getNode();
SDNode *N1 = N->getOperand(1).getNode();
return N0->hasOneUse() && N1->hasOneUse() &&
isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
}
return false;
}
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDNode *N0 = N->getOperand(0).getNode();
SDNode *N1 = N->getOperand(1).getNode();
return N0->hasOneUse() && N1->hasOneUse() &&
isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
}
return false;
}
SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
// The rounding mode is in bits 23:22 of the FPCR.
// The rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
// The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
// so that the shift + and get folded into a bitfield extract.
SDLoc dl(Op);
SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
MVT::i64));
SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
DAG.getConstant(22, dl, MVT::i32));
return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
DAG.getConstant(3, dl, MVT::i32));
}
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
EVT VT = Op.getValueType();
assert(VT.is128BitVector() && VT.isInteger() &&
"unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
SDNode *N1 = Op.getOperand(1).getNode();
unsigned NewOpc = 0;
bool isMLA = false;
bool isN0SExt = isSignExtended(N0, DAG);
bool isN1SExt = isSignExtended(N1, DAG);
if (isN0SExt && isN1SExt)
NewOpc = AArch64ISD::SMULL;
else {
bool isN0ZExt = isZeroExtended(N0, DAG);
bool isN1ZExt = isZeroExtended(N1, DAG);
if (isN0ZExt && isN1ZExt)
NewOpc = AArch64ISD::UMULL;
else if (isN1SExt || isN1ZExt) {
// Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
// into (s/zext A * s/zext C) + (s/zext B * s/zext C)
if (isN1SExt && isAddSubSExt(N0, DAG)) {
NewOpc = AArch64ISD::SMULL;
isMLA = true;
} else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
NewOpc = AArch64ISD::UMULL;
isMLA = true;
} else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
std::swap(N0, N1);
NewOpc = AArch64ISD::UMULL;
isMLA = true;
}
}
if (!NewOpc) {
if (VT == MVT::v2i64)
// Fall through to expand this. It is not legal.
return SDValue();
else
// Other vector multiplications are legal.
return Op;
}
}
// Legalize to a S/UMULL instruction
SDLoc DL(Op);
SDValue Op0;
SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
if (!isMLA) {
Op0 = skipExtensionForVectorMULL(N0, DAG);
assert(Op0.getValueType().is64BitVector() &&
Op1.getValueType().is64BitVector() &&
"unexpected types for extended operands to VMULL");
return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
}
// Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
// isel lowering to take advantage of no-stall back to back s/umul + s/umla.
// This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
EVT Op1VT = Op1.getValueType();
return DAG.getNode(N0->getOpcode(), DL, VT,
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
}
case Intrinsic::aarch64_neon_abs: {
EVT Ty = Op.getValueType();
if (Ty == MVT::i64) {
SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
Op.getOperand(1));
Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
} else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
} else {
report_fatal_error("Unexpected type for AArch64 NEON intrinic");
}
}
case Intrinsic::aarch64_neon_smax:
return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umax:
return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_smin:
return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umin:
return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::localaddress: {
const auto &MF = DAG.getMachineFunction();
const auto *RegInfo = Subtarget->getRegisterInfo();
unsigned Reg = RegInfo->getLocalAddressRegister(MF);
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
Op.getSimpleValueType());
}
case Intrinsic::eh_recoverfp: {
// FIXME: This needs to be implemented to correctly handle highly aligned
// stack objects. For now we simply return the incoming FP. Refer D53541
// for more details.
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
report_fatal_error(
"llvm.eh.recoverfp must take a function as the first argument");
return IncomingFPOp;
}
}
}
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
EVT VT, EVT MemVT,
SelectionDAG &DAG) {
assert(VT.isVector() && "VT should be a vector type");
assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
SDValue Value = ST->getValue();
// It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
// extracts the word lane which represents the v4i8 subvector. It optimizes
// the store to:
//
// xtn v0.8b, v0.8h
// str s0, [x0]
SDValue Undef = DAG.getUNDEF(MVT::i16);
SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
{Undef, Undef, Undef, Undef});
SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
Value, UndefVec);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Trunc, DAG.getConstant(0, DL, MVT::i64));
return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
ST->getBasePtr(), ST->getMemOperand());
}
// Custom lowering for any store, vector or scalar, with or without a
// truncate operation. Currently only the truncating store from v4i16 to
// v4i8 is custom lowered.
SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc Dl(Op);
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
assert (StoreNode && "Can only custom lower store nodes");
SDValue Value = StoreNode->getValue();
EVT VT = Value.getValueType();
EVT MemVT = StoreNode->getMemoryVT();
assert (VT.isVector() && "Can only custom lower vector store types");
unsigned AS = StoreNode->getAddressSpace();
unsigned Align = StoreNode->getAlignment();
if (Align < MemVT.getStoreSize() &&
!allowsMisalignedMemoryAccesses(
MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);
}
if (StoreNode->isTruncatingStore()) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}
return SDValue();
}
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Custom lowering: ");
LLVM_DEBUG(Op.dump());
switch (Op.getOpcode()) {
default:
llvm_unreachable("unimplemented operand");
return SDValue();
case ISD::BITCAST:
return LowerBITCAST(Op, DAG);
case ISD::GlobalAddress:
return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress:
return LowerGlobalTLSAddress(Op, DAG);
case ISD::SETCC:
return LowerSETCC(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
case ISD::SELECT:
return LowerSELECT(Op, DAG);
case ISD::SELECT_CC:
return LowerSELECT_CC(Op, DAG);
case ISD::JumpTable:
return LowerJumpTable(Op, DAG);
case ISD::BR_JT:
return LowerBR_JT(Op, DAG);
case ISD::ConstantPool:
return LowerConstantPool(Op, DAG);
case ISD::BlockAddress:
return LowerBlockAddress(Op, DAG);
case ISD::VASTART:
return LowerVASTART(Op, DAG);
case ISD::VACOPY:
return LowerVACOPY(Op, DAG);
case ISD::VAARG:
return LowerVAARG(Op, DAG);
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE:
return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO:
return LowerXALUO(Op, DAG);
case ISD::FADD:
return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
case ISD::FSUB:
return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
case ISD::FMUL:
return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
case ISD::FDIV:
return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FP_ROUND:
return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND:
return LowerFP_EXTEND(Op, DAG);
case ISD::FRAMEADDR:
return LowerFRAMEADDR(Op, DAG);
case ISD::SPONENTRY:
return LowerSPONENTRY(Op, DAG);
case ISD::RETURNADDR:
return LowerRETURNADDR(Op, DAG);
case ISD::ADDROFRETURNADDR:
return LowerADDROFRETURNADDR(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
return LowerVectorSRA_SRL_SHL(Op, DAG);
case ISD::SHL_PARTS:
return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS:
return LowerShiftRightParts(Op, DAG);
case ISD::CTPOP:
return LowerCTPOP(Op, DAG);
case ISD::FCOPYSIGN:
return LowerFCOPYSIGN(Op, DAG);
case ISD::OR:
return LowerVectorOR(Op, DAG);
case ISD::XOR:
return LowerXOR(Op, DAG);
case ISD::PREFETCH:
return LowerPREFETCH(Op, DAG);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return LowerINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
case ISD::FLT_ROUNDS_:
return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
return LowerVECREDUCE(Op, DAG);
case ISD::ATOMIC_LOAD_SUB:
return LowerATOMIC_LOAD_SUB(Op, DAG);
case ISD::ATOMIC_LOAD_AND:
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
}
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
switch (CC) {
default:
report_fatal_error("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
case CallingConv::GHC:
return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::PreserveMost:
case CallingConv::CXX_FAST_TLS:
case CallingConv::Swift:
if (Subtarget->isTargetWindows() && IsVarArg)
return CC_AArch64_Win64_VarArg;
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
case CallingConv::Win64:
return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
case CallingConv::AArch64_VectorCall:
return CC_AArch64_AAPCS;
}
}
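/// Selects the correct CCAssignFn for the return values of a given
/// CallingConvention value.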
CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
}
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
// At this point, Ins[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeFormalArguments to pass in ValVT and
// LocVT.
unsigned NumArgs = Ins.size();
Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Ins[i].VT;
if (Ins[i].isOrigArg()) {
std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[i].getOrigArgIndex();
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
}
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res =
AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
assert(ArgLocs.size() == Ins.size());
SmallVector<SDValue, 16> ArgValues;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
// non-compliant manner for larger structs.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should work for fundamental types too.
unsigned FrameIdx =
MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
continue;
}
if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
SDValue ArgValue;
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = &AArch64::GPR32RegClass;
else if (RegVT == MVT::i64)
RC = &AArch64::GPR64RegClass;
else if (RegVT == MVT::f16)
RC = &AArch64::FPR16RegClass;
else if (RegVT == MVT::f32)
RC = &AArch64::FPR32RegClass;
else if (RegVT == MVT::f64 || RegVT.is64BitVector())
RC = &AArch64::FPR64RegClass;
else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &AArch64::FPR128RegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
// If this is an 8, 16 or 32-bit value, it is really passed promoted
// to 64 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
case CCValAssign::AExt:
case CCValAssign::SExt:
case CCValAssign::ZExt:
// SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
// nodes after our lowering.
assert(RegVT == Ins[i].VT && "incorrect register location selected");
break;
}
InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
!Ins[i].Flags.isInConsecutiveRegs())
BEAlign = 8 - ArgSize;
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue ArgValue;
// For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
MVT MemVT = VA.getValVT();
switch (VA.getLocInfo()) {
default:
break;
case CCValAssign::BCvt:
MemVT = VA.getLocVT();
break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
case CCValAssign::ZExt:
ExtType = ISD::ZEXTLOAD;
break;
case CCValAssign::AExt:
ExtType = ISD::EXTLOAD;
break;
}
ArgValue = DAG.getExtLoad(
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MemVT);
InVals.push_back(ArgValue);
}
}
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin() || IsWin64) {
// The AAPCS variadic function ABI is identical to the non-variadic
// one. As a result there may be more arguments in registers and we should
// save them for future reference.
// Win64 variadic functions also pass arguments in registers, but all float
// arguments are passed in integer registers.
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment.
StackOffset = ((StackOffset + 7) & ~7);
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
if (MFI.hasMustTailInVarArgFunc()) {
SmallVector<MVT, 2> RegParmTypes;
RegParmTypes.push_back(MVT::i64);
RegParmTypes.push_back(MVT::f128);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
CC_AArch64_AAPCS);
// Conservatively forward X8, since it might be used for aggregate return.
if (!CCInfo.isAllocated(AArch64::X8)) {
unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
}
}
}
// On Windows, InReg pointers must be returned, so record the pointer in a
// virtual register at the start of the function so it can be returned in the
// epilogue.
if (IsWin64) {
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
if (Ins[I].Flags.isInReg()) {
assert(!FuncInfo->getSRetReturnReg());
MVT PtrTy = getPointerTy(DAG.getDataLayout());
unsigned Reg =
MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
break;
}
}
}
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
StackArgSize = alignTo(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
FuncInfo->setArgumentStackToRestore(StackArgSize);
// This realignment carries over to the available bytes below. Our own
// callers will guarantee the space is free by giving an aligned value to
// CALLSEQ_START.
}
// Even if we're not expected to free up the space, it's useful to know how
// much is there while considering tail calls (because we can reuse it).
FuncInfo->setBytesInStackArgArea(StackArgSize);
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
return Chain;
}
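// Spill the still-unallocated X0-X7 (and, when FP/SIMD is available and not
// targeting Win64, Q0-Q7) argument registers to the stack so va_start/va_arg
// can reach them, recording the frame indices and sizes in AArch64FunctionInfo.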
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SelectionDAG &DAG,
const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
SmallVector<SDValue, 8> MemOps;
static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7 };
static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
if (IsWin64) {
GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
if (GPRSaveSize & 15)
// The extra size here, if triggered, will always be 8.
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
} else
GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
IsWin64
? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
GPRIdx,
(i - FirstVariadicGPR) * 8)
: MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
}
}
FuncInfo->setVarArgsGPRIndex(GPRIdx);
FuncInfo->setVarArgsGPRSize(GPRSaveSize);
if (Subtarget->hasFPARMv8() && !IsWin64) {
static const MCPhysReg FPRArgRegs[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
}
}
FuncInfo->setVarArgsFPRIndex(FPRIdx);
FuncInfo->setVarArgsFPRSize(FPRSaveSize);
}
if (!MemOps.empty()) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
}
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
if (i == 0 && isThisReturn) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
continue;
}
SDValue Val =
DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
}
InVals.push_back(Val);
}
return Chain;
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return CC == CallingConv::Fast;
}
/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
case CallingConv::PreserveMost:
case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
}
}
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
for (Function::const_arg_iterator i = CallerF.arg_begin(),
e = CallerF.arg_end();
i != e; ++i) {
if (i->hasByValAttr())
return false;
// On Windows, "inreg" attributes signify non-aggregate indirect returns.
// In this case, it is necessary to save/restore X0 in the callee. Tail
// call opt interferes with this. So we disable tail call opt when the
// caller has an argument with "inreg" attribute.
// FIXME: Check whether the callee also has an "inreg" argument.
if (i->hasInRegAttr())
return false;
}
if (getTargetMachine().Options.GuaranteedTailCallOpt)
return canGuaranteeTCO(CalleeCC) && CCMatch;
// Externally-defined functions with weak linkage should not be
// tail-called on AArch64 when the OS does not support dynamic
// pre-emption of symbols, as the AAELF spec requires normal calls
// to undefined weak functions to be replaced with a NOP or jump to the
// next instruction. The behaviour of branch instructions in this
// situation (as used for tail calls) is implementation-defined, so we
// cannot rely on the linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
const Triple &TT = getTargetMachine().getTargetTriple();
if (GV->hasExternalWeakLinkage() &&
(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
return false;
}
// Now we search for cases where we can use a tail call without changing the
// ABI. Sibcall is used in some places (particularly gcc) to refer to this
// concept.
// I want anyone implementing a new calling convention to think long and hard
// about this assert.
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
// caller is C then we could potentially use its argument area.
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (const CCValAssign &ArgLoc : ArgLocs)
if (!ArgLoc.isRegLoc())
return false;
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
CCAssignFnForCall(CalleeCC, isVarArg),
CCAssignFnForCall(CallerCC, isVarArg)))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (Subtarget->hasCustomCallingConv()) {
TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
}
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
// Nothing more to check if the callee is taking no arguments
if (Outs.empty())
return true;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
return false;
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
return true;
}
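// When a tail call is about to overwrite the fixed stack object ClobberedFI,
// glue every load from an overlapping incoming-argument slot into the returned
// TokenFactor so those loads happen before the clobbering store.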
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
SelectionDAG &DAG,
MachineFrameInfo &MFI,
int ClobberedFI) const {
SmallVector<SDValue, 8> ArgChains;
int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
// Include the original chain at the beginning of the list. When this is
// used by target LowerCall hooks, this helps legalize find the
// CALLSEQ_BEGIN node.
ArgChains.push_back(Chain);
// Add a chain value for each stack argument load that overlaps the area
// being clobbered.
for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
UE = DAG.getEntryNode().getNode()->use_end();
U != UE; ++U)
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
int64_t InLastByte = InFirstByte;
InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
(FirstByte <= InFirstByte && InFirstByte <= LastByte))
ArgChains.push_back(SDValue(L, 1));
}
// Build a tokenfactor for all the chains.
return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
bool TailCallOpt) const {
return CallCC == CallingConv::Fast && TailCallOpt;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
if (!TailCallOpt && IsTailCall)
IsSibCall = true;
if (IsTailCall)
++NumTailCalls;
}
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
/*IsVarArg=*/ !Outs[i].IsFixed);
bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
} else {
// At this point, Outs[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeCallOperands to pass in ValVT and
// LocVT.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Outs[i].VT;
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(),
CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
}
// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;
if (IsTailCall && !IsSibCall) {
unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
NumBytes = alignTo(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
// can actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;
// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
// a 16-byte aligned SP and the delta applied for the tail call should
// satisfy the same constraint.
assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
}
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
}
}
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[realArgIdx];
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
// Promote the value if needed.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
if (Outs[realArgIdx].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
}
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
break;
}
if (VA.isRegLoc()) {
if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i64) {
assert(VA.getLocVT() == MVT::i64 &&
"unexpected calling convention register assignment");
assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
"unexpected use of 'returned'");
IsThisReturn = true;
}
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
SDValue DstAddr;
MachinePointerInfo DstInfo;
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
!Flags.isInConsecutiveRegs()) {
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
// clobbered.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ false,
/*isTailCall = */ false,
DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
// promoted to a legal register type i32, we should truncate Arg back to
// i1/i8/i16.
if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
VA.getValVT() == MVT::i16)
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto &RegToPass : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
auto GV = G->getGlobal();
if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
AArch64II::MO_GOT) {
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
} else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
} else {
const GlobalValue *GV = G->getGlobal();
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
}
} else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
Subtarget->isTargetMachO()) {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
} else {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
}
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call. However, in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
InFlag = Chain.getValue(1);
}
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
}
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass)
Ops.push_back(DAG.getRegister(RegToPass.first,
RegToPass.second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
if (IsThisReturn) {
// For 'this' returns, use the X0-preserving mask if applicable
Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
if (!Mask) {
IsThisReturn = false;
Mask = TRI->getCallPreservedMask(MF, CallConv);
}
} else
Mask = TRI->getCallPreservedMask(MF, CallConv);
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(MF, &Mask);
if (TRI->isAnyArgRegReserved(MF))
TRI->emitReservedArgRegCallError(MF);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
Ops.push_back(InFlag);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// If we're doing a tail call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
uint64_t CalleePopBytes =
DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(CalleePopBytes, DL, true),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
}
bool AArch64TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC);
}
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
auto &MF = DAG.getMachineFunction();
auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC);
// Copy the result values into the output registers.
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
if (Outs[i].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to i8 by the producer of the
// value. This is strictly redundant on Darwin (which uses "zeroext
// i1"), but will be optimised out before ISel.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
}
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
}
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
// Windows AArch64 ABIs require that for returning structs by value we copy
// the sret argument into X0 for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into X0.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
getPointerTy(MF.getDataLayout()));
unsigned RetValReg = AArch64::X0;
Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (AArch64::GPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else if (AArch64::FPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}
//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//
SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
N->getOffset(), Flag);
}
SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
}
SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
N->getOffset(), Flag);
}
SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
}
// (loadGOT sym)
template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes instead of using a wrapper node.
return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
}
// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
AArch64ISD::WrapperLarge, DL, Ty,
getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
}
// (addlow (adrp %hi(sym)) %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
SDValue Lo = getTargetNode(N, Ty, DAG,
AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
}
// (adr sym)
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
}
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
if (OpFlags != AArch64II::MO_NO_FLAG)
assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
"unexpected offset in global node");
// This also catches the large code model case for Darwin, and tiny code
// model with got relocations.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
return getGOT(GN, DAG, OpFlags);
}
SDValue Result;
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
Result = getAddrLarge(GN, DAG, OpFlags);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
Result = getAddrTiny(GN, DAG, OpFlags);
} else {
Result = getAddr(GN, DAG, OpFlags);
}
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(GN);
if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
/// + "extern __thread" declaration.
/// + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
/// adrp x0, _var@TLVPPAGE
/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
/// ; the function pointer
/// blr x1 ; Uses descriptor address in x0
/// ; Address of _var is now in x0.
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() &&
"This function expects a Darwin target");
SDLoc DL(Op);
MVT PtrVT = getPointerTy(DAG.getDataLayout());
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
SDValue TLVPAddr =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
// The first entry in the descriptor is a function pointer that we must call
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet = DAG.getLoad(
MVT::i64, DL, Chain, DescAddr,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
/* Alignment = */ 8,
MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable);
Chain = FuncTLVGet.getValue(1);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getTLSCallPreservedMask();
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
// Finally, we can make the call. This is just a degenerate version of a
// normal AArch64 call node: x0 takes the address of the descriptor, and
// returns the address of the variable in this thread.
Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
Chain =
DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
DAG.getRegisterMask(Mask), Chain.getValue(1));
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution.
///
/// The sequence is:
/// adrp x0, :tlsdesc:var
/// ldr x1, [x0, #:tlsdesc_lo12:var]
/// add x0, x0, #:tlsdesc_lo12:var
/// .tlsdesccall var
/// blr x1
/// (TPIDR_EL0 offset now in x0)
///
/// The above sequence must be produced unscheduled, to enable the linker to
/// optimize/relax this sequence.
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain =
DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
if (getTargetMachine().getCodeModel() == CodeModel::Large)
report_fatal_error("ELF TLS only supported in small memory model");
// Different choices can be made for the maximum size of the TLS area for a
// module. For the small address model, the default TLS size is 16MiB and the
// maximum TLS size is 4GiB.
// FIXME: add -mtls-size command line option and make it control the 16MiB
// vs. 4GiB code sequence generation.
// FIXME: add tiny codemodel support. We currently generate the same code as
// small, which may be larger than needed.
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
if (Model == TLSModel::LocalDynamic)
Model = TLSModel::GeneralDynamic;
}
SDValue TPOff;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalValue *GV = GA->getGlobal();
SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
if (Model == TLSModel::LocalExec) {
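// Local-exec: the variable's offset from the thread pointer is a link-time
// constant, so the emitted sequence is roughly (relocation spellings are
// illustrative):
//   mrs  xN, TPIDR_EL0
//   add  xN, xN, #:tprel_hi12:var, lsl #12
//   add  xN, xN, #:tprel_lo12_nc:var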
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
SDValue TPWithOff_lo =
SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
SDValue TPWithOff =
SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
return TPWithOff;
} else if (Model == TLSModel::InitialExec) {
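// Initial-exec: the TP-relative offset is fixed at load time, so it is
// loaded from the GOT (via :gottprel:-style relocations) and added to the
// thread pointer by the common ADD at the end of this function.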
TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
} else if (Model == TLSModel::LocalDynamic) {
// Local-dynamic accesses proceed in two phases. A general-dynamic TLS
// descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
// the beginning of the module's TLS region, followed by a DTPREL offset
// calculation.
// These accesses will need deduplicating if there's more than one.
AArch64FunctionInfo *MFI =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
AArch64II::MO_TLS);
// Now we can calculate the offset from TPIDR_EL0 to this module's
// thread-local area.
TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
// Now use :dtprel_whatever: operations to calculate this variable's offset
// in its thread-storage area.
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
} else if (Model == TLSModel::GeneralDynamic) {
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
SDValue SymAddr =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
// Finally we can make a call to calculate the offset from tpidr_el0.
TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
} else
llvm_unreachable("Unsupported ELF TLS access model");
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
SDValue
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
SDValue Chain = DAG.getEntryNode();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
// Load the ThreadLocalStoragePointer from the TEB
// A pointer to the TLS array is located at offset 0x58 from the TEB.
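// The code built below corresponds roughly to (relocation names are
// illustrative; the actual ones are COFF-specific):
//   ldr  x8, [x18, #0x58]        // TEB->ThreadLocalStoragePointer
//   adrp x9, _tls_index
//   ldr  w9, [x9, :lo12:_tls_index]
//   ldr  x8, [x8, x9, lsl #3]    // this module's TLS block
//   add  x0, x8, <offset of the variable within the .tls section>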
SDValue TLSArray =
DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
Chain = TLSArray.getValue(1);
// Load the TLS index from the C runtime;
// This does the same as getAddr(), but without having a GlobalAddressSDNode.
// This also does the same as LOADgot, but using a generic i32 load,
// while LOADgot only loads i64.
SDValue TLSIndexHi =
DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
"_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
SDValue TLSIndex =
DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
Chain = TLSIndex.getValue(1);
// The pointer to the thread's TLS data area is loaded from the TLS array at
// the TLS index scaled by 8 (the pointer size).
TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
DAG.getConstant(3, DL, PtrVT));
SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
MachinePointerInfo());
Chain = TLS.getValue(1);
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GA->getGlobal();
SDValue TGAHi = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue TGALo = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
// Add the offset from the start of the .tls section (section base).
SDValue Addr =
SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
return Addr;
}
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetWindows())
return LowerWindowsGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
}
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
MachineFunction &MF = DAG.getMachineFunction();
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
// will not be produced, as they are conditional branch instructions that do
// not set flags.
bool ProduceNonFlagSettingCondBr =
!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
// Handle f128 first, since lowering it will result in comparing the return
// value of a libcall against zero, which is just what the rest of LowerBR_CC
// is expecting to deal with.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
CC = ISD::SETNE;
}
}
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
// The actual operation with overflow check.
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
if (CC == ISD::SETNE)
OFCC = getInvertedCondCode(OFCC);
SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Overflow);
}
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
// If the RHS of the comparison is zero, we can potentially fold this
// to a specialized branch.
const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
if (CC == ISD::SETEQ) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
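// For example, (brcond (seteq (and x, 4), 0), dest) becomes "tbz x, #2, dest".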
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
Dest);
}
return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETNE) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
Dest);
}
return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
}
if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Cmp);
}
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
LHS.getValueType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue BR1 =
DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
Cmp);
}
return BR1;
}
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue In1 = Op.getOperand(0);
SDValue In2 = Op.getOperand(1);
EVT SrcVT = In2.getValueType();
if (SrcVT.bitsLT(VT))
In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
else if (SrcVT.bitsGT(VT))
In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
EVT VecVT;
uint64_t EltMask;
SDValue VecVal1, VecVal2;
auto setVecVal = [&] (int Idx) {
if (!VT.isVector()) {
VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
DAG.getUNDEF(VecVT), In1);
VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
DAG.getUNDEF(VecVT), In2);
} else {
VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
}
};
if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
EltMask = 0x80000000ULL;
setVecVal(AArch64::ssub);
} else if (VT == MVT::f64 || VT == MVT::v2f64) {
VecVT = MVT::v2i64;
// We want to materialize a mask with the high bit set, but the AdvSIMD
// immediate moves cannot materialize that in a single instruction for
// 64-bit elements. Instead, materialize zero and then negate it.
EltMask = 0;
setVecVal(AArch64::dsub);
} else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
EltMask = 0x8000ULL;
setVecVal(AArch64::hsub);
} else {
llvm_unreachable("Invalid type for copysign!");
}
SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
// If we couldn't materialize the mask above, then the mask vector will be
// the zero vector, and we need to negate it here.
if (VT == MVT::f64 || VT == MVT::v2f64) {
BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
}
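// The BIT (bitwise insert if true) node below in effect computes
//   result = (VecVal1 & ~Mask) | (VecVal2 & Mask)
// with Mask holding only the sign bit of each element, so the sign comes from
// In2 and the magnitude from In1, which is exactly copysign.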
SDValue Sel =
DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
if (VT == MVT::f16)
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
if (VT == MVT::f32)
return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
else if (VT == MVT::f64)
return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
else
return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
}
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat))
return SDValue();
if (!Subtarget->hasNEON())
return SDValue();
// While there is no integer popcount instruction, CTPOP can be lowered
// more efficiently to the following sequence, which uses AdvSIMD
// registers/instructions, as long as the copies to/from the AdvSIMD
// registers are cheap.
// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
// CNT V0.8B, V0.8B // 8xbyte pop-counts
// ADDV B0, V0.8B // sum 8xbyte pop-counts
// UMOV X0, V0.B[0] // copy byte result back to integer reg
SDValue Val = Op.getOperand(0);
SDLoc DL(Op);
EVT VT = Op.getValueType();
if (VT == MVT::i32 || VT == MVT::i64) {
if (VT == MVT::i32)
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
SDValue UaddLV = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
}
assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
"Unexpected type for custom ctpop lowering");
EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
Val = DAG.getBitcast(VT8Bit, Val);
Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
// Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
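// E.g. for v4i32: bitcast to v16i8, CNT, then uaddlp to v8i16 and uaddlp
// again to v4i32.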
unsigned EltSize = 8;
unsigned NumElts = VT.is64BitVector() ? 8 : 16;
while (EltSize != VT.getScalarSizeInBits()) {
EltSize *= 2;
NumElts /= 2;
MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
Val = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
}
return Val;
}
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVSETCC(Op, DAG);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDLoc dl(Op);
// We chose ZeroOrOneBooleanContents, so use zero and one.
EVT VT = Op.getValueType();
SDValue TVal = DAG.getConstant(1, dl, VT);
SDValue FVal = DAG.getConstant(0, dl, VT);
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
return LHS;
}
}
if (LHS.getValueType().isInteger()) {
SDValue CCVal;
SDValue Cmp =
getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
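// (CSINC Wd, WZR, WZR, cond computes "cond ? 0 : 1", so selecting on the
// inverted condition yields the original predicate as a 0/1 value.)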
return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
}
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
LHS.getValueType() == MVT::f64);
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
if (CC2 == AArch64CC::AL) {
changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
} else {
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
// totally clean. Some of them require two CSELs to implement. As is in
// this case, we emit the first CSEL and then emit a second using the output
// of the first as the RHS. We're effectively OR'ing the two CC's together.
// FIXME: It would be nice if we could match the two CSELs to two CSINCs.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 =
DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
}
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
CC = ISD::SETNE;
}
}
// Also handle f16, for which we need to do a f32 comparison.
if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
}
// Next, handle integers.
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
// order to form a CSINV or CSINC out of them.
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
} else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
} else if (TVal.getOpcode() == ISD::XOR) {
// If TVal is a NOT we want to swap TVal and FVal so that we can match
// with a CSINV rather than a CSEL.
if (isAllOnesConstant(TVal.getOperand(1))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
}
} else if (TVal.getOpcode() == ISD::SUB) {
// If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
// that we can match with a CSNEG rather than a CSEL.
if (isNullConstant(TVal.getOperand(0))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
}
} else if (CTVal && CFVal) {
const int64_t TrueVal = CTVal->getSExtValue();
const int64_t FalseVal = CFVal->getSExtValue();
bool Swap = false;
// If both TVal and FVal are constants, see if FVal is the
// inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
// instead of a CSEL in that case.
if (TrueVal == ~FalseVal) {
Opcode = AArch64ISD::CSINV;
} else if (TrueVal == -FalseVal) {
Opcode = AArch64ISD::CSNEG;
} else if (TVal.getValueType() == MVT::i32) {
// If our operands are only 32-bit wide, make sure we use 32-bit
// arithmetic for the check whether we can use CSINC. This ensures that
// the addition in the check will wrap around properly in case there is
// an overflow (which would not be the case if we do the check with
// 64-bit arithmetic).
const uint32_t TrueVal32 = CTVal->getZExtValue();
const uint32_t FalseVal32 = CFVal->getZExtValue();
if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
Opcode = AArch64ISD::CSINC;
if (TrueVal32 > FalseVal32) {
Swap = true;
}
}
// 64-bit check whether we can use CSINC.
} else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
Opcode = AArch64ISD::CSINC;
if (TrueVal > FalseVal) {
Swap = true;
}
}
// Swap TVal and FVal if necessary.
if (Swap) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
}
if (Opcode != AArch64ISD::CSEL) {
// Drop FVal since we can get its value by simply inverting/negating
// TVal.
FVal = TVal;
}
}
// Avoid materializing a constant when possible by reusing a known value in
// a register. However, don't perform this optimization if the known value
// is one, zero or negative one in the case of a CSEL. We can always
// materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
// FVal, respectively.
ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
!RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
// Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
// "a != C ? x : a" to avoid materializing C.
if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
TVal = LHS;
else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
FVal = LHS;
} else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
// avoid materializing C.
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
Opcode = AArch64ISD::CSINV;
TVal = LHS;
FVal = DAG.getConstant(0, dl, FVal.getValueType());
}
}
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
EVT VT = TVal.getValueType();
return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
}
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
LHS.getValueType() == MVT::f64);
assert(LHS.getValueType() == RHS.getValueType());
EVT VT = TVal.getValueType();
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two CSELs to implement.
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
if (DAG.getTarget().Options.UnsafeFPMath) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
if (RHSVal && RHSVal->isZero()) {
ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
TVal = LHS;
else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
CFVal && CFVal->isZero() &&
FVal.getValueType() == LHS.getValueType())
FVal = LHS;
}
}
// Emit first, and possibly only, CSEL.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
// If we need a second CSEL, emit it, using the output of the first as the
// RHS. We're effectively OR'ing the two CC's together.
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
// Otherwise, return the output of the first CSEL.
return CS1;
}
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
SDLoc DL(Op);
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SelectionDAG &DAG) const {
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
SDLoc DL(Op);
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
// instruction.
if (isOverflowIntrOpRes(CCVal)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
return SDValue();
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
}
// Lower it the same way as we would lower a SELECT_CC node.
ISD::CondCode CC;
SDValue LHS, RHS;
if (CCVal.getOpcode() == ISD::SETCC) {
LHS = CCVal.getOperand(0);
RHS = CCVal.getOperand(1);
CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
} else {
LHS = CCVal;
RHS = DAG.getConstant(0, DL, CCVal.getValueType());
CC = ISD::SETNE;
}
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries are PC-relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
return getAddrLarge(JT, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(JT, DAG);
}
return getAddr(JT, DAG);
}
SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries are PC-relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
SDLoc DL(Op);
SDValue JT = Op.getOperand(1);
SDValue Entry = Op.getOperand(2);
int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
SDNode *Dest =
DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
SDValue(Dest, 0));
}
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
// Use the GOT for the large code model on iOS.
if (Subtarget->isTargetMachO()) {
return getGOT(CP, DAG);
}
return getAddrLarge(CP, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(CP, DAG);
} else {
return getAddr(CP, DAG);
}
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
return getAddrLarge(BA, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(BA, DAG);
}
return getAddr(BA, DAG);
}
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
? FuncInfo->getVarArgsGPRIndex()
: FuncInfo->getVarArgsStackIndex(),
getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
// Standard, section B.3.
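// As described there, the va_list is effectively:
//   struct va_list {
//     void *__stack;   // offset 0:  next stacked argument
//     void *__gr_top;  // offset 8:  end of the GP register save area
//     void *__vr_top;  // offset 16: end of the FP/SIMD register save area
//     int   __gr_offs; // offset 24: GP register bytes remaining (negative)
//     int   __vr_offs; // offset 28: FP/SIMD register bytes remaining (negative)
//   };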
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue VAList = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
MachinePointerInfo(SV), /* Alignment = */ 8));
// void *__gr_top at offset 8
int GPRSize = FuncInfo->getVarArgsGPRSize();
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
GRTopAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
MachinePointerInfo(SV, 8),
/* Alignment = */ 8));
}
// void *__vr_top at offset 16
int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(16, DL, PtrVT));
VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
MachinePointerInfo(SV, 16),
/* Alignment = */ 8));
}
// int __gr_offs at offset 24
SDValue GROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
MemOps.push_back(DAG.getStore(
Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
MachinePointerInfo(SV, 24), /* Alignment = */ 4));
// int __vr_offs at offset 28
SDValue VROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
MemOps.push_back(DAG.getStore(
Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
MachinePointerInfo(SV, 28), /* Alignment = */ 4));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
return LowerWin64_VASTART(Op, DAG);
else if (Subtarget->isTargetDarwin())
return LowerDarwin_VASTART(Op, DAG);
else
return LowerAAPCS_VASTART(Op, DAG);
}
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
SelectionDAG &DAG) const {
// AAPCS has three pointers and two ints (= 32 bytes); Darwin and Windows have
// a single pointer.
SDLoc DL(Op);
unsigned VaListSize =
Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
Op.getOperand(2),
DAG.getConstant(VaListSize, DL, MVT::i32),
8, false, false, false, MachinePointerInfo(DestSV),
MachinePointerInfo(SrcSV));
}
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() &&
"automatic va_arg instruction only works on Darwin");
const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
unsigned Align = Op.getConstantOperandVal(3);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
if (Align > 8) {
assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(Align - 1, DL, PtrVT));
VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
DAG.getConstant(-(int64_t)Align, DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
// vaargs list to match this, and for FP values we need to introduce
// FP_ROUND nodes as well.
if (VT.isInteger() && !VT.isVector())
ArgSize = 8;
bool NeedFPTrunc = false;
if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
ArgSize = 8;
NeedFPTrunc = true;
}
// Increment the pointer, VAList, to the next vaarg
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
// Store the incremented VAList to the legalized pointer
SDValue APStore =
DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
if (NeedFPTrunc) {
// Load the value as an f64.
SDValue WideFP =
DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
// Round the value down to an f32.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL));
SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
// Merge the rounded value with the chain output of the load.
return DAG.getMergeValues(Ops, DL);
}
return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
return FrameAddr;
}
SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
EVT VT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
int FI = MFI.CreateFixedObject(4, 0, false);
return DAG.getFrameIndex(FI, VT);
}
#define GET_REGISTER_MATCHER
#include "AArch64GenAsmMatcher.inc"
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
if (!Subtarget->isXRegisterReserved(DwarfRegNum))
Reg = 0;
}
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
+ StringRef(RegName) + "\"."));
}
SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
}
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setReturnAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
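// Conceptually, for a 128-bit value {Hi,Lo} shifted right by ShAmt:
//   ShAmt < 64:  Lo = (Lo >>u ShAmt) | (Hi << (64 - ShAmt)),  Hi = Hi >> ShAmt
//   ShAmt >= 64: Lo = Hi >> (ShAmt - 64),  Hi = (arithmetic ? Hi >>s 63 : 0)
// Both arms are computed below and the correct one is selected with CSELs.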
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
ISD::SETEQ, dl, DAG);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
HiBitsForLo =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
HiBitsForLo, CCVal, Cmp);
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
SDValue LoForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
// AArch64 shifts larger than the register width are wrapped rather than
// clamped, so we can't just emit "hi >> x".
SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
SDValue HiForBigShift =
Opc == ISD::SRA
? DAG.getNode(Opc, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i64))
: DAG.getConstant(0, dl, VT);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
assert(Op.getOpcode() == ISD::SHL_PARTS);
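// Conceptually, for a 128-bit value {Hi,Lo} shifted left by ShAmt:
//   ShAmt < 64:  Hi = (Hi << ShAmt) | (Lo >>u (64 - ShAmt)),  Lo = Lo << ShAmt
//   ShAmt >= 64: Hi = Lo << (ShAmt - 64),  Lo = 0
// Both arms are computed below and the correct one is selected with CSELs.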
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
ISD::SETEQ, dl, DAG);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
LoBitsForHi =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
LoBitsForHi, CCVal, Cmp);
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
SDValue HiForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
// AArch64 shifts of larger than register sizes are wrapped rather than
// clamped, so we can't just emit "lo << a" if a is too big.
SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
bool AArch64TargetLowering::isOffsetFoldingLegal(
const GlobalAddressSDNode *GA) const {
// Offsets are folded in the DAG combine rather than here so that we can
// intelligently choose an offset based on the uses.
return false;
}
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool OptForSize) const {
bool IsLegal = false;
// We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
// and for the 16-bit case when the target has full fp16 support.
// FIXME: We should be able to handle f128 as well with a clever lowering.
const APInt ImmInt = Imm.bitcastToAPInt();
if (VT == MVT::f64)
IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
else if (VT == MVT::f32)
IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
else if (VT == MVT::f16 && Subtarget->hasFullFP16())
IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
// TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
// generate that fmov.
// If we cannot materialize the value in the immediate field of an fmov, check
// if it can be encoded as the immediate operand of a logical instruction.
// The immediate value will be created with either MOVZ, MOVN, or ORR.
if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
// The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
// however the mov+fmov sequence is always better because of the reduced
// cache pressure. The timings are still the same if you consider
// movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
// movw+movk is fused). So we limit the expansion to at most 2 instructions.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
Insn);
unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
IsLegal = Insn.size() <= Limit;
}
LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
<< " imm value: "; Imm.dump(););
return IsLegal;
}
//===----------------------------------------------------------------------===//
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
SDValue Operand, SelectionDAG &DAG,
int &ExtraSteps) {
EVT VT = Operand.getValueType();
if (ST->hasNEON() &&
(VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
VT == MVT::f32 || VT == MVT::v1f32 ||
VT == MVT::v2f32 || VT == MVT::v4f32)) {
if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
// For the reciprocal estimates, convergence is quadratic, so the number
// of digits is doubled after each iteration. In ARMv8, the accuracy of
// the initial estimate is 2^-8. Thus the number of extra steps to refine
// the result for float (23 mantissa bits) is 2 and for double (52
// mantissa bits) is 3.
ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
}
return SDValue();
}
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps,
bool &UseOneConst,
bool Reciprocal) const {
if (Enabled == ReciprocalEstimate::Enabled ||
(Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
DAG, ExtraSteps)) {
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
Flags.setAllowReassociation(true);
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
Flags);
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
if (!Reciprocal) {
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
// Correct the result if the operand is 0.0.
Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
VT, Eq, Operand, Estimate);
}
ExtraSteps = 0;
return Estimate;
}
return SDValue();
}
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps) const {
if (Enabled == ReciprocalEstimate::Enabled)
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
DAG, ExtraSteps)) {
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
Flags.setAllowReassociation(true);
// Newton reciprocal iteration: E * (2 - X * E)
// AArch64 reciprocal iteration instruction: (2 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
Estimate, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
ExtraSteps = 0;
return Estimate;
}
return SDValue();
}
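// Illustrative only: the scalar form of the per-lane refinement steps that the
// FRSQRTE/FRSQRTS and FRECPE/FRECPS lowerings above implement. These
// hypothetical helpers are not used anywhere in this file; they merely make
// the iteration formulas in the comments concrete.
//   rsqrt step:  E' = E * 0.5 * (3 - X * E * E)
//   recip step:  E' = E * (2 - X * E)
static inline double refineRSqrtEstimateExample(double X, double E) {
  return E * 0.5 * (3.0 - X * E * E);
}
static inline double refineRecipEstimateExample(double X, double E) {
  return E * (2.0 - X * E);
}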
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler, not all of them may make sense.
//
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
// x - An FP/SIMD register of some size in the range v0-v15
// I - Constant that can be used with an ADD instruction
// J - Constant that can be used with a SUB instruction
// K - Constant that can be used with a 32-bit logical instruction
// L - Constant that can be used with a 64-bit logical instruction
// M - Constant that can be used as a 32-bit MOV immediate
// N - Constant that can be used as a 64-bit MOV immediate
// Q - A memory reference with base register and no offset
// S - A symbolic address
// Y - Floating point constant zero
// Z - Integer constant zero
//
// Note that general register operands will be output using their 64-bit x
// register name, whatever the size of the variable, unless the asm operand
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
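// Illustrative examples of the constraints and modifiers above (GCC/Clang
// extended asm; the variable names are placeholders, not part of this file):
//   int ires, ia, ib;  float fres, fa, fb;
//   asm("add %w0, %w1, %w2" : "=r"(ires) : "r"(ia), "r"(ib));  // %w -> w reg
//   asm("fadd %s0, %s1, %s2" : "=w"(fres) : "w"(fa), "w"(fb)); // %s -> s reg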
const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
// At this point, we have to lower this constraint to something else, so we
// lower it to an "r" or "w". However, by doing this we will force the result
// to be in a register, while the X constraint is much more permissive.
//
// Although we are correct (we are free to emit anything, without
// constraints), we might break use cases that would expect us to be more
// efficient and emit something else.
if (!Subtarget->hasFPARMv8())
return "r";
if (ConstraintVT.isFloatingPoint())
return "w";
if (ConstraintVT.isVector() &&
(ConstraintVT.getSizeInBits() == 64 ||
ConstraintVT.getSizeInBits() == 128))
return "w";
return "r";
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
break;
case 'x':
case 'w':
return C_RegisterClass;
// An address with a single base register. Due to the way we
// currently handle addresses it is the same as 'r'.
case 'Q':
return C_Memory;
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'Y':
case 'Z':
return C_Immediate;
case 'z':
case 'S': // A symbolic address
return C_Other;
}
}
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
AArch64TargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
break;
case 'x':
case 'w':
if (type->isFloatingPointTy() || type->isVectorTy())
weight = CW_Register;
break;
case 'z':
weight = CW_Constant;
break;
}
return weight;
}
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
case 'w':
if (!Subtarget->hasFPARMv8())
break;
if (VT.getSizeInBits() == 16)
return std::make_pair(0U, &AArch64::FPR16RegClass);
if (VT.getSizeInBits() == 32)
return std::make_pair(0U, &AArch64::FPR32RegClass);
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::FPR64RegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128RegClass);
break;
// The instructions that this constraint is designed for can
// only take 128-bit registers so just use that regclass.
case 'x':
if (!Subtarget->hasFPARMv8())
break;
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128_loRegClass);
break;
}
}
if (StringRef("{cc}").equals_lower(Constraint))
return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass *> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
int RegNo;
bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
if (VT != MVT::Other && VT.getSizeInBits() == 64) {
Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR64RegClass;
} else {
Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR128RegClass;
}
}
}
}
if (Res.second && !Subtarget->hasFPARMv8() &&
!AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
!AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
return std::make_pair(0U, nullptr);
return Res;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Currently only support length 1 constraints.
if (Constraint.length() != 1)
return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default:
break;
// This set of constraints deals with valid constants for various instructions.
// Validate and return a target constant for them if we can.
case 'z': {
// 'z' maps to xzr or wzr so it needs an input of 0.
if (!isNullConstant(Op))
return;
if (Op.getValueType() == MVT::i64)
Result = DAG.getRegister(AArch64::XZR, MVT::i64);
else
Result = DAG.getRegister(AArch64::WZR, MVT::i32);
break;
}
case 'S': {
// An absolute symbolic address or label reference.
if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
GA->getValueType(0));
} else if (const BlockAddressSDNode *BA =
dyn_cast<BlockAddressSDNode>(Op)) {
Result =
DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
} else if (const ExternalSymbolSDNode *ES =
dyn_cast<ExternalSymbolSDNode>(Op)) {
Result =
DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
} else
return;
break;
}
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return;
// Grab the value and do some validation.
uint64_t CVal = C->getZExtValue();
switch (ConstraintLetter) {
// The I constraint applies only to simple ADD or SUB immediate operands:
// i.e. 0 to 4095 with optional shift by 12
// The J constraint applies only to ADD or SUB immediates that would be
// valid when negated, i.e. if [an add pattern] were to be output as a SUB
// instruction [or vice versa], in other words -1 to -4095 with optional
// left shift by 12.
case 'I':
if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
break;
return;
case 'J': {
uint64_t NVal = -C->getSExtValue();
if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
CVal = C->getSExtValue();
break;
}
return;
}
// The K and L constraints apply *only* to logical immediates, including
// what used to be the MOVI alias for ORR (though the MOVI alias has now
// been removed and MOV should be used). So these constraints have to
// distinguish between bit patterns that are valid 32-bit or 64-bit
// "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
// not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
// versa.
case 'K':
if (AArch64_AM::isLogicalImmediate(CVal, 32))
break;
return;
case 'L':
if (AArch64_AM::isLogicalImmediate(CVal, 64))
break;
return;
// The M and N constraints are a superset of K and L respectively, for use
// with the MOV (immediate) alias. As well as the logical immediates they
// also match 32 or 64-bit immediates that can be loaded either using a
// *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
// (M) or 64-bit 0x1234000000000000 (N) etc.
// As a note, some of this code is liberally stolen from the asm parser.
case 'M': {
if (!isUInt<32>(CVal))
return;
if (AArch64_AM::isLogicalImmediate(CVal, 32))
break;
if ((CVal & 0xFFFF) == CVal)
break;
if ((CVal & 0xFFFF0000ULL) == CVal)
break;
uint64_t NCVal = ~(uint32_t)CVal;
if ((NCVal & 0xFFFFULL) == NCVal)
break;
if ((NCVal & 0xFFFF0000ULL) == NCVal)
break;
return;
}
case 'N': {
if (AArch64_AM::isLogicalImmediate(CVal, 64))
break;
if ((CVal & 0xFFFFULL) == CVal)
break;
if ((CVal & 0xFFFF0000ULL) == CVal)
break;
if ((CVal & 0xFFFF00000000ULL) == CVal)
break;
if ((CVal & 0xFFFF000000000000ULL) == CVal)
break;
uint64_t NCVal = ~CVal;
if ((NCVal & 0xFFFFULL) == NCVal)
break;
if ((NCVal & 0xFFFF0000ULL) == NCVal)
break;
if ((NCVal & 0xFFFF00000000ULL) == NCVal)
break;
if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
break;
return;
}
default:
return;
}
// All assembler immediates are 64-bit integers.
Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
break;
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//
/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
EVT VT = V64Reg.getValueType();
unsigned NarrowSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
SDLoc DL(V64Reg);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
V64Reg, DAG.getConstant(0, DL, MVT::i32));
}
/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
static unsigned getExtFactor(SDValue &V) {
EVT EltType = V.getValueType().getVectorElementType();
return EltType.getSizeInBits() / 8;
}
/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
EVT VT = V128Reg.getValueType();
unsigned WideSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
SDLoc DL(V128Reg);
return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
}
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
struct ShuffleSourceInfo {
SDValue Vec;
unsigned MinElt;
unsigned MaxElt;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
// ShuffleVec will be some sliding window into the original Vec.
SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element "WindowBase
// + i * WindowScale" in ShuffleVec.
int WindowBase;
int WindowScale;
ShuffleSourceInfo(SDValue Vec)
: Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(V.getOperand(1))) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: "
"a shuffle can only come from building a vector from "
"various elements of other vectors, provided their "
"indices are constant\n");
return SDValue();
}
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
auto Source = find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Update the minimum and maximum lane number seen.
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
if (Sources.size() > 2) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: currently only do something sane when at "
"most two source vectors are involved\n");
return SDValue();
}
// Find out the smallest element size among result and two sources, and use
// it as element size to build the shuffle_vector.
EVT SmallestEltTy = VT.getVectorElementType();
for (auto &Source : Sources) {
EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
if (SrcEltTy.bitsLT(SmallestEltTy)) {
SmallestEltTy = SrcEltTy;
}
}
unsigned ResMultiplier =
VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
// If the source vector is too wide or too narrow, we may nevertheless be able
// to construct a compatible shuffle either by concatenating it with UNDEF or
// extracting a suitable range of elements.
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
if (SrcVT.getSizeInBits() == VT.getSizeInBits())
continue;
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
Src.ShuffleVec =
DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
}
assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
return SDValue();
}
if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
Src.WindowBase = -NumSrcElts;
} else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
} else {
// An actual VEXT is needed
SDValue VEXTSrc1 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
SDValue VEXTSrc2 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
VEXTSrc2,
DAG.getConstant(Imm, dl, MVT::i32));
Src.WindowBase = -Src.MinElt;
}
}
// Another possible incompatibility occurs from the vector element types. We
// can fix this by bitcasting the source vectors to the same type we intend
// for the shuffle.
for (auto &Src : Sources) {
EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
// Final sanity check before we try to actually produce a shuffle.
LLVM_DEBUG(for (auto Src
: Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.isUndef())
continue;
auto Src = find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
int BitsDefined =
std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
// starting at the appropriate offset.
int *LaneMask = &Mask[i * ResMultiplier];
int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
ExtractBase += NumElts * (Src - Sources.begin());
for (int j = 0; j < LanesDefined; ++j)
LaneMask[j] = ExtractBase + j;
}
// Final check before we try to produce nonsense...
if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
return SDValue();
}
SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
ShuffleOps[1], Mask);
SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
dbgs() << "Reshuffle, creating node: "; V.dump(););
return V;
}
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
unsigned NumElts = VT.getVectorNumElements();
// Assume that the first shuffle index is not UNDEF. Fail if it is.
if (M[0] < 0)
return false;
Imm = M[0];
// If this is a VEXT shuffle, the immediate value is the index of the first
// element. The other shuffle indices must be the successive elements after
// the first one.
unsigned ExpectedElt = Imm;
for (unsigned i = 1; i < NumElts; ++i) {
// Increment the expected index. If it wraps around, just follow it
// back to index zero and keep going.
++ExpectedElt;
if (ExpectedElt == NumElts)
ExpectedElt = 0;
if (M[i] < 0)
continue; // ignore UNDEF indices
if (ExpectedElt != static_cast<unsigned>(M[i]))
return false;
}
return true;
}
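// Illustrative example: for <4 x i32> with identical shuffle sources, the mask
// <2, 3, 0, 1> is accepted here with Imm == 2, since the indices are
// consecutive modulo the vector length.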
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
unsigned &Imm) {
// Look for the first non-undef element.
const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
// Benefit from APInt to handle overflow when calculating the expected element.
unsigned NumElts = VT.getVectorNumElements();
unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
// The following shuffle indices must be the successive elements after the
// first real element.
const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
[&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
if (FirstWrongElt != M.end())
return false;
// The index of an EXT is the first element if it is not UNDEF.
// Watch out for the beginning UNDEFs. The EXT index should be the expected
// value of the first element. E.g.
// <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
// <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
// ExpectedElt is the last mask index plus 1.
Imm = ExpectedElt.getZExtValue();
// There are two different cases that require reversing the input vectors.
// For example, for vector <4 x i32> we have the following cases,
// Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
// Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
// For both cases, we finally use mask <5, 6, 7, 0>, which requires reversing
// the two input vectors.
if (Imm < NumElts)
ReverseEXT = true;
else
Imm -= NumElts;
return true;
}
/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
"Only possible block sizes for REV are: 16, 32, 64");
unsigned EltSz = VT.getScalarSizeInBits();
if (EltSz == 64)
return false;
unsigned NumElts = VT.getVectorNumElements();
unsigned BlockElts = M[0] + 1;
// If the first shuffle index is UNDEF, be optimistic.
if (M[0] < 0)
BlockElts = BlockSize / EltSz;
if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
return false;
for (unsigned i = 0; i < NumElts; ++i) {
if (M[i] < 0)
continue; // ignore UNDEF indices
if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
return false;
}
return true;
}
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
return false;
Idx += 1;
}
return true;
}
static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i != NumElts; ++i) {
if (M[i] < 0)
continue; // ignore UNDEF indices
if ((unsigned)M[i] != 2 * i + WhichResult)
return false;
}
return true;
}
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
if (NumElts % 2 != 0)
return false;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
return false;
}
return true;
}
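// Illustrative masks for a <4 x i32> shuffle of v1 (lanes 0-3) and v2
// (lanes 4-7), as accepted by the three checks above:
//   ZIP1 <0, 4, 1, 5>   ZIP2 <2, 6, 3, 7>
//   UZP1 <0, 2, 4, 6>   UZP2 <1, 3, 5, 7>
//   TRN1 <0, 4, 2, 6>   TRN2 <1, 5, 3, 7>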
/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
if (NumElts % 2 != 0)
return false;
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
return false;
Idx += 1;
}
return true;
}
/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned Half = VT.getVectorNumElements() / 2;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned j = 0; j != 2; ++j) {
unsigned Idx = WhichResult;
for (unsigned i = 0; i != Half; ++i) {
int MIdx = M[i + j * Half];
if (MIdx >= 0 && (unsigned)MIdx != Idx)
return false;
Idx += 2;
}
}
return true;
}
/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
if (NumElts % 2 != 0)
return false;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
return false;
}
return true;
}
static bool isINSMask(ArrayRef<int> M, int NumInputElements,
bool &DstIsLeft, int &Anomaly) {
if (M.size() != static_cast<size_t>(NumInputElements))
return false;
int NumLHSMatch = 0, NumRHSMatch = 0;
int LastLHSMismatch = -1, LastRHSMismatch = -1;
for (int i = 0; i < NumInputElements; ++i) {
if (M[i] == -1) {
++NumLHSMatch;
++NumRHSMatch;
continue;
}
if (M[i] == i)
++NumLHSMatch;
else
LastLHSMismatch = i;
if (M[i] == i + NumInputElements)
++NumRHSMatch;
else
LastRHSMismatch = i;
}
if (NumLHSMatch == NumInputElements - 1) {
DstIsLeft = true;
Anomaly = LastLHSMismatch;
return true;
} else if (NumRHSMatch == NumInputElements - 1) {
DstIsLeft = false;
Anomaly = LastRHSMismatch;
return true;
}
return false;
}
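// Illustrative example for 4 input elements: the mask <0, 1, 6, 3> matches the
// LHS everywhere except lane 2, which comes from lane 2 of the RHS, so this
// returns DstIsLeft == true and Anomaly == 2; the caller lowers it to a single
// INS of that one element.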
static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
if (VT.getSizeInBits() != 128)
return false;
unsigned NumElts = VT.getVectorNumElements();
for (int I = 0, E = NumElts / 2; I != E; I++) {
if (Mask[I] != I)
return false;
}
int Offset = NumElts / 2;
for (int I = NumElts / 2, E = NumElts; I != E; I++) {
if (Mask[I] != I + SplitLHS * Offset)
return false;
}
return true;
}
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue V0 = Op.getOperand(0);
SDValue V1 = Op.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
VT.getVectorElementType() != V1.getValueType().getVectorElementType())
return SDValue();
bool SplitV0 = V0.getValueSizeInBits() == 128;
if (!isConcatMask(Mask, VT, SplitV0))
return SDValue();
EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
if (SplitV0) {
V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
DAG.getConstant(0, DL, MVT::i64));
}
if (V1.getValueSizeInBits() == 128) {
V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
DAG.getConstant(0, DL, MVT::i64));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}
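// Illustrative example: a v4i32 shuffle of two v4i32 inputs with mask
// <0, 1, 4, 5> takes the low half of each source, so it is lowered here to an
// EXTRACT_SUBVECTOR of each operand followed by a CONCAT_VECTORS.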
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VREV,
OP_VDUP0,
OP_VDUP1,
OP_VDUP2,
OP_VDUP3,
OP_VEXT1,
OP_VEXT2,
OP_VEXT3,
OP_VUZPL, // VUZP, left result
OP_VUZPR, // VUZP, right result
OP_VZIPL, // VZIP, left result
OP_VZIPR, // VZIP, right result
OP_VTRNL, // VTRN, left result
OP_VTRNR // VTRN, right result
};
if (OpNum == OP_COPY) {
if (LHSID == (1 * 9 + 2) * 9 + 3)
return LHS;
assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
return RHS;
}
SDValue OpLHS, OpRHS;
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
EVT VT = OpLHS.getValueType();
switch (OpNum) {
default:
llvm_unreachable("Unknown shuffle opcode!");
case OP_VREV:
// VREV divides the vector in half and swaps within the half.
if (VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::f32)
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
if (VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::f16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
case OP_VDUP0:
case OP_VDUP1:
case OP_VDUP2:
case OP_VDUP3: {
EVT EltTy = VT.getVectorElementType();
unsigned Opcode;
if (EltTy == MVT::i8)
Opcode = AArch64ISD::DUPLANE8;
else if (EltTy == MVT::i16 || EltTy == MVT::f16)
Opcode = AArch64ISD::DUPLANE16;
else if (EltTy == MVT::i32 || EltTy == MVT::f32)
Opcode = AArch64ISD::DUPLANE32;
else if (EltTy == MVT::i64 || EltTy == MVT::f64)
Opcode = AArch64ISD::DUPLANE64;
else
llvm_unreachable("Invalid vector element type?");
if (VT.getSizeInBits() == 64)
OpLHS = WidenVector(OpLHS, DAG);
SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
}
case OP_VEXT1:
case OP_VEXT2:
case OP_VEXT3: {
unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
DAG.getConstant(Imm, dl, MVT::i32));
}
case OP_VUZPL:
return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VUZPR:
return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VZIPL:
return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VZIPR:
return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VTRNL:
return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VTRNR:
return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
}
}
static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
SelectionDAG &DAG) {
// Check to see if we can use the TBL instruction.
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
SDLoc DL(Op);
EVT EltVT = Op.getValueType().getVectorElementType();
unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
SmallVector<SDValue, 8> TBLMask;
for (int Val : ShuffleMask) {
for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
unsigned Offset = Byte + Val * BytesPerElt;
TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
}
}
MVT IndexVT = MVT::v8i8;
unsigned IndexLen = 8;
if (Op.getValueSizeInBits() == 128) {
IndexVT = MVT::v16i8;
IndexLen = 16;
}
SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
SDValue Shuffle;
if (V2.getNode()->isUndef()) {
if (IndexLen == 8)
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
} else {
if (IndexLen == 8) {
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
} else {
// FIXME: We cannot, for the moment, emit a TBL2 instruction because we
// cannot currently represent the register constraints on the input
// table registers.
// Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
// DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
// IndexLen));
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
V2Cst, DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
}
}
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}
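// Illustrative example: for a two-operand <4 x i16> shuffle with mask
// <1, 3, 5, 7>, BytesPerElt is 2, so TBLMask holds the byte indices
// <2,3, 6,7, 10,11, 14,15> into the 16-byte concatenation of the two sources,
// and a single TBL1 over the combined table suffices.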
static unsigned getDUPLANEOp(EVT EltType) {
if (EltType == MVT::i8)
return AArch64ISD::DUPLANE8;
if (EltType == MVT::i16 || EltType == MVT::f16)
return AArch64ISD::DUPLANE16;
if (EltType == MVT::i32 || EltType == MVT::f32)
return AArch64ISD::DUPLANE32;
if (EltType == MVT::i64 || EltType == MVT::f64)
return AArch64ISD::DUPLANE64;
llvm_unreachable("Invalid vector element type?");
}
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
// during code selection. This is more efficient and avoids the possibility
// of inconsistencies between legalization and selection.
ArrayRef<int> ShuffleMask = SVN->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)
Lane = 0;
if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
V1.getOperand(0));
// Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
// constant. If so, we can just reference the lane's definition directly.
if (V1.getOpcode() == ISD::BUILD_VECTOR &&
!isa<ConstantSDNode>(V1.getOperand(Lane)))
return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
// Otherwise, duplicate from the lane of the input vector.
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
// SelectionDAGBuilder may have "helpfully" already extracted or concatenated
// to make a vector of the same size as this SHUFFLE. We can ignore the
// extract entirely, and canonicalise the concat using WidenVector.
if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
V1 = V1.getOperand(0);
} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
Lane -= Idx * VT.getVectorNumElements() / 2;
V1 = WidenVector(V1.getOperand(Idx), DAG);
} else if (VT.getSizeInBits() == 64)
V1 = WidenVector(V1, DAG);
return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
}
if (isREVMask(ShuffleMask, VT, 64))
return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 32))
return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
bool ReverseEXT = false;
unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
if (ReverseEXT)
std::swap(V1, V2);
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
} else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
unsigned WhichResult;
if (isZIPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isUZPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isTRNMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
return Concat;
bool DstIsLeft;
int Anomaly;
int NumInputElements = V1.getValueType().getVectorNumElements();
if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
SDValue DstVec = DstIsLeft ? V1 : V2;
SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
SDValue SrcVec = V1;
int SrcLane = ShuffleMask[Anomaly];
if (SrcLane >= NumInputElements) {
SrcVec = V2;
SrcLane -= VT.getVectorNumElements();
}
SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
ScalarVT = MVT::i32;
return DAG.getNode(
ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
DstLaneV);
}
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 4) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
if (ShuffleMask[i] < 0)
PFIndexes[i] = 8;
else
PFIndexes[i] = ShuffleMask[i];
}
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
}
return GenerateTBL(Op, ShuffleMask, DAG);
}
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
for (unsigned i = 0; i < NumSplats; ++i) {
CnstBits <<= SplatBitSize;
UndefBits <<= SplatBitSize;
CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
}
return true;
}
return false;
}
// Try 64-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
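// Illustrative example: a build vector splatting 0x00FF00FF00FF00FF satisfies
// isAdvSIMDModImmType10 above (every byte is either 0x00 or 0xFF), so it can
// be materialized with a single 64-bit MOVI instead of a constant-pool load.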
// Try 32-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits,
const SDValue *LHS = nullptr) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
bool isAdvSIMDModImm = false;
uint64_t Shift;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
Shift = 0;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
Shift = 8;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
Shift = 16;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
Shift = 24;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov;
if (LHS)
Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
else
Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try 16-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits,
const SDValue *LHS = nullptr) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
bool isAdvSIMDModImm = false;
uint64_t Shift;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
Shift = 0;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
Shift = 8;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov;
if (LHS)
Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
else
Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try 32-bit splatted SIMD immediate with shifted ones.
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
SelectionDAG &DAG, const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
bool isAdvSIMDModImm = false;
uint64_t Shift;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
Shift = 264;
}
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
Shift = 272;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try 8-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Try FP splatted SIMD immediate.
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
bool isWide = (VT.getSizeInBits() == 128);
MVT MovTy;
bool isAdvSIMDModImm = false;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
}
else if (isWide &&
(isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
MovTy = MVT::v2f64;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
return SDValue();
}
// Specialized code to quickly check whether PotentialBVec is a BUILD_VECTOR
// whose elements are all the same constant integer value, returned in the
// reference argument ConstVal.
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
uint64_t &ConstVal) {
BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
if (!Bvec)
return false;
ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
if (!FirstElt)
return false;
EVT VT = Bvec->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 1; i < NumElts; ++i)
if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
return false;
ConstVal = FirstElt->getZExtValue();
return true;
}
static unsigned getIntrinsicID(const SDNode *N) {
unsigned Opcode = N->getOpcode();
switch (Opcode) {
default:
return Intrinsic::not_intrinsic;
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
if (IID < Intrinsic::num_intrinsics)
return IID;
return Intrinsic::not_intrinsic;
}
}
}
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// i.e. (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTOR with constant element C1, C2 is a constant, and C1 == ~C2.
// Also, logical shift right -> SRI, with the same structure.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
SDLoc DL(N);
// Is the first op an AND?
const SDValue And = N->getOperand(0);
if (And.getOpcode() != ISD::AND)
return SDValue();
// Is the second op an shl or lshr?
SDValue Shift = N->getOperand(1);
// This will have been turned into: AArch64ISD::VSHL vector, #shift
// or AArch64ISD::VLSHR vector, #shift
unsigned ShiftOpc = Shift.getOpcode();
if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
return SDValue();
bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
// Is the shift amount constant?
ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C2node)
return SDValue();
// Is the and mask vector all constant?
uint64_t C1;
if (!isAllConstantBuildVector(And.getOperand(1), C1))
return SDValue();
// Is C1 == ~C2, taking into account how much one can shift elements of a
// particular size?
uint64_t C2 = C2node->getZExtValue();
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
if (C2 > ElemSizeInBits)
return SDValue();
unsigned ElemMask = (1 << ElemSizeInBits) - 1;
if ((C1 & ElemMask) != (~C2 & ElemMask))
return SDValue();
SDValue X = And.getOperand(0);
SDValue Y = Shift.getOperand(0);
unsigned Intrin =
IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
SDValue ResultSLI =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
Shift.getOperand(1));
LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
LLVM_DEBUG(N->dump(&DAG));
LLVM_DEBUG(dbgs() << "into: \n");
LLVM_DEBUG(ResultSLI->dump(&DAG));
++NumShiftInserts;
return ResultSLI;
}
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
if (EnableAArch64SlrGeneration) {
if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
return Res;
}
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
BuildVectorSDNode *BVN =
dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
if (!BVN) {
// OR commutes, so try swapping the operands.
LHS = Op.getOperand(1);
BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
}
if (!BVN)
return Op;
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
DefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
DefBits, &LHS)))
return NewOp;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
UndefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
UndefBits, &LHS)))
return NewOp;
}
// We can always fall back to a non-immediate OR.
return Op;
}
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
static SDValue NormalizeBuildVector(SDValue Op,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT EltTy= VT.getVectorElementType();
if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
return Op;
SmallVector<SDValue, 16> Ops;
for (SDValue Lane : Op->ops()) {
// For integer vectors, type legalization would have promoted the
// operands already. Otherwise, if Op is a floating-point splat
// (with operands cast to integers), then the only possibilities
// are constants and UNDEFs.
if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
APInt LowBits(EltTy.getSizeInBits(),
CstLane->getZExtValue());
Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
} else if (Lane.getNode()->isUndef()) {
Lane = DAG.getUNDEF(MVT::i32);
} else {
assert(Lane.getValueType() == MVT::i32 &&
"Unexpected BUILD_VECTOR operand type");
}
Ops.push_back(Lane);
}
return DAG.getBuildVector(VT, dl, Ops);
}
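// Illustrative example: in a v16i8 BUILD_VECTOR whose operands were promoted
// to i32 by type legalization, a lane holding the constant 0x1FF is truncated
// to its low 8 bits and re-emitted as the i32 constant 0xFF above.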
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;
DefBits = ~DefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
return NewOp;
DefBits = UndefBits;
if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;
DefBits = ~UndefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
return NewOp;
}
return SDValue();
}
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// Try to build a simple constant vector.
Op = NormalizeBuildVector(Op, DAG);
if (VT.isInteger()) {
// Certain vector constants, used to express things like logical NOT and
// arithmetic NEG, are passed through unmodified. This allows special
// patterns for these operations to match, which will lower these constants
// to whatever is proven necessary.
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
if (BVN->isConstant())
if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
unsigned BitSize = VT.getVectorElementType().getSizeInBits();
APInt Val(BitSize,
Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
if (Val.isNullValue() || Val.isAllOnesValue())
return Op;
}
}
if (SDValue V = ConstantBuildVector(Op, DAG))
return V;
// Scan through the operands to find some interesting properties we can
// exploit:
// 1) If only one value is used, we can use a DUP, or
// 2) if only the low element is not undef, we can just insert that, or
// 3) if only one constant value is used (w/ some non-constant lanes),
// we can splat the constant value into the whole vector then fill
// in the non-constant lanes.
// 4) FIXME: If different constant values are used, but we can intelligently
// select the values we'll be overwriting for the non-constant
// lanes such that we can directly materialize the vector
// some other way (MOVI, e.g.), we can be sneaky.
// 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
bool usesOnlyOneConstantValue = true;
bool isConstant = true;
bool AllLanesExtractElt = true;
unsigned NumConstantLanes = 0;
SDValue Value;
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
AllLanesExtractElt = false;
if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
++NumConstantLanes;
if (!ConstantValue.getNode())
ConstantValue = V;
else if (ConstantValue != V)
usesOnlyOneConstantValue = false;
}
if (!Value.getNode())
Value = V;
else if (V != Value)
usesOnlyOneValue = false;
}
if (!Value.getNode()) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
return DAG.getUNDEF(VT);
}
// Convert BUILD_VECTOR where all elements but the lowest are undef into
// SCALAR_TO_VECTOR, except for when we have a single-element constant vector
// as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
"SCALAR_TO_VECTOR node\n");
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
}
if (AllLanesExtractElt) {
SDNode *Vector = nullptr;
bool Even = false;
bool Odd = false;
// Check whether the extract elements match the Even pattern <0,2,4,...> or
// the Odd pattern <1,3,5,...>.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
const SDNode *N = V.getNode();
if (!isa<ConstantSDNode>(N->getOperand(1)))
break;
SDValue N0 = N->getOperand(0);
// All elements are extracted from the same vector.
if (!Vector) {
Vector = N0.getNode();
// Check that the type of EXTRACT_VECTOR_ELT matches the type of
// BUILD_VECTOR.
if (VT.getVectorElementType() !=
N0.getValueType().getVectorElementType())
break;
} else if (Vector != N0.getNode()) {
Odd = false;
Even = false;
break;
}
// Extracted values are either at Even indices <0,2,4,...> or at Odd
// indices <1,3,5,...>.
uint64_t Val = N->getConstantOperandVal(1);
if (Val == 2 * i) {
Even = true;
continue;
}
if (Val - 1 == 2 * i) {
Odd = true;
continue;
}
// Something does not match: abort.
Odd = false;
Even = false;
break;
}
if (Even || Odd) {
SDValue LHS =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
DAG.getConstant(0, dl, MVT::i64));
SDValue RHS =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
DAG.getConstant(NumElts, dl, MVT::i64));
if (Even && !Odd)
return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
RHS);
if (Odd && !Even)
return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
RHS);
}
}
// Use DUP for non-constant splats. For FP constant splats, bitcast the lanes
// to the equivalent integer type and try again.
if (usesOnlyOneValue) {
if (!isConstant) {
if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Value.getValueType() != VT) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
}
// This is actually a DUPLANExx operation, which keeps everything vectory.
SDValue Lane = Value.getOperand(1);
Value = Value.getOperand(0);
if (Value.getValueSizeInBits() == 64) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
"widening it\n");
Value = WidenVector(Value, DAG);
}
unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
return DAG.getNode(Opcode, dl, VT, Value, Lane);
}
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
"Unsupported floating-point vector type");
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
"BITCASTS, and try again\n");
MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
Val.dump(););
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
}
// If a single constant value was used for more than one lane, start by
// start by splatting that value, then replace the non-constant lanes. This
// is better than the default, which will perform a separate initialization
// for each lane.
if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
// Firstly, try to materialize the splat constant.
SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
Val = ConstantBuildVector(Vec, DAG);
if (!Val) {
// Otherwise, materialize the constant and splat it.
Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
}
// Now insert the non-constant lanes.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
}
return Val;
}
// This will generate a load from the constant pool.
if (isConstant) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
"expansion\n");
return SDValue();
}
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
if (SDValue shuffle = ReconstructShuffle(Op, DAG))
return shuffle;
}
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
// scalar_to_vector for the elements followed by a shuffle (provided the
// shuffle is valid for the target) and materialization element by element
// on the stack followed by a load for everything else.
if (!isConstant && !usesOnlyOneValue) {
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
"of INSERT_VECTOR_ELT\n");
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
unsigned i = 0;
// Use SCALAR_TO_VECTOR for lane zero to
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
// value is already in an S or D register, and we're forced to emit an
// INSERT_SUBREG that we can't fold anywhere.
//
// We also allow types like i8 and i16 which are illegal scalar but legal
// vector element types. After type-legalization the inserted value is
// extended (i32), and it is safe to cast it to the vector type, ignoring
// the upper bits of the lowest lane (e.g. v8i8, v4i16).
if (!Op0.isUndef()) {
LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
++i;
}
LLVM_DEBUG(if (i < NumElts) dbgs()
<< "Creating nodes for the other vector elements:\n";);
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
}
return Vec;
}
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
"better alternative\n");
return SDValue();
}
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
VT == MVT::v8f16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
// to a V128 type and performing the insertion on that.
SDLoc DL(Op);
SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
EVT WideTy = WideVec.getValueType();
SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
Op.getOperand(1), Op.getOperand(2));
// Re-narrow the resultant vector.
return NarrowVector(Node, DAG);
}
SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
VT == MVT::v8f16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
// to a V128 type and performing the extraction on that.
SDLoc DL(Op);
SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
EVT WideTy = WideVec.getValueType();
EVT ExtrTy = WideTy.getVectorElementType();
if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
ExtrTy = MVT::i32;
// For extractions, we just return the result directly.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
Op.getOperand(1));
}
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getOperand(0).getValueType();
SDLoc dl(Op);
// Just in case...
if (!VT.isVector())
return SDValue();
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!Cst)
return SDValue();
unsigned Val = Cst->getZExtValue();
unsigned Size = Op.getValueSizeInBits();
// This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
if (Val == 0)
return Op;
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
return Op;
return SDValue();
}
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
if (M[i] < 0)
PFIndexes[i] = 8;
else
PFIndexes[i] = M[i];
}
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return true;
}
bool DummyBool;
int DummyInt;
unsigned DummyUnsigned;
return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
// isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
isZIPMask(M, VT, DummyUnsigned) ||
isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
isConcatMask(M, VT, VT.getSizeInBits() == 128));
}
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
// Ignore bit_converts.
while (Op.getOpcode() == ISD::BITCAST)
Op = Op.getOperand(0);
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
HasAnyUndefs, ElementBits) ||
SplatBitSize > ElementBits)
return false;
Cnt = SplatBits.getSExtValue();
return true;
}
/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
/// 0 <= Value < ElementBits for a left shift; or
/// 0 <= Value <= ElementBits for a long left shift.
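/// For example, for a v4i32 shift (ElementBits == 32) a splat of 31 is a
/// valid immediate, while a splat of 32 is only accepted for the long form.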
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
/// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits/2 for a narrowing right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
int64_t Cnt;
if (!Op.getOperand(1).getValueType().isVector())
return Op;
unsigned EltSize = VT.getScalarSizeInBits();
switch (Op.getOpcode()) {
default:
llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
MVT::i32),
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
// Right shift immediate
if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
(Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
}
// Right shift by register. Note that there is no vector shift-right-by-register
// instruction; the shift-left-by-register instruction takes a signed shift
// amount, where negative values specify a right shift.
unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
: Intrinsic::aarch64_neon_ushl;
// negate the shift amount
SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
SDValue NegShiftLeft =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
NegShift);
return NegShiftLeft;
}
return SDValue();
}
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode CC, bool NoNans, EVT VT,
const SDLoc &dl, SelectionDAG &DAG) {
EVT SrcVT = LHS.getValueType();
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
"function only supposed to emit natural comparisons");
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
APInt CnstBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
bool IsZero = IsCnst && (CnstBits == 0);
if (SrcVT.getVectorElementType().isFloatingPoint()) {
switch (CC) {
default:
return SDValue();
case AArch64CC::NE: {
SDValue Fcmeq;
if (IsZero)
Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
else
Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
}
case AArch64CC::EQ:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
case AArch64CC::GE:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
case AArch64CC::LS:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
case AArch64CC::LT:
if (!NoNans)
return SDValue();
// If we ignore NaNs then we can use the MI implementation.
LLVM_FALLTHROUGH;
case AArch64CC::MI:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
}
}
switch (CC) {
default:
return SDValue();
case AArch64CC::NE: {
SDValue Cmeq;
if (IsZero)
Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
else
Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
}
case AArch64CC::EQ:
if (IsZero)
return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
case AArch64CC::GE:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
case AArch64CC::LE:
if (IsZero)
return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
case AArch64CC::LS:
return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
case AArch64CC::LO:
return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
case AArch64CC::LT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
case AArch64CC::HI:
return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
case AArch64CC::HS:
return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
}
}
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
SDLoc dl(Op);
if (LHS.getValueType().getVectorElementType().isInteger()) {
assert(LHS.getValueType() == RHS.getValueType());
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
SDValue Cmp =
EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
// Make v4f16 (only) fcmp operations utilise vector instructions;
// v8f16 support will be a little more complicated.
if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
if (LHS.getValueType().getVectorNumElements() == 4) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
DAG.ReplaceAllUsesWith(Op, NewSetcc);
CmpVT = MVT::v4i32;
} else
return SDValue();
}
assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
LHS.getValueType().getVectorElementType() != MVT::f128);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two comparisons to implement.
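// For example, an ordered not-equal compare (SETONE) can only be expressed
// as two comparisons OR'ed together: (x > y) || (y > x), both of which are
// false when either input is a NaN.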
AArch64CC::CondCode CC1, CC2;
bool ShouldInvert;
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
SDValue Cmp =
EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
return SDValue();
if (CC2 != AArch64CC::AL) {
SDValue Cmp2 =
EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
if (!Cmp2.getNode())
return SDValue();
Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
}
Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
if (ShouldInvert)
Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
return Cmp;
}
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
SelectionDAG &DAG) {
SDValue VecOp = ScalarOp.getOperand(0);
auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
DAG.getConstant(0, DL, MVT::i64));
}
SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
switch (Op.getOpcode()) {
case ISD::VECREDUCE_ADD:
return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
case ISD::VECREDUCE_SMAX:
return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
case ISD::VECREDUCE_SMIN:
return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
case ISD::VECREDUCE_UMAX:
return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
case ISD::VECREDUCE_UMIN:
return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
case ISD::VECREDUCE_FMAX: {
assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
Op.getOperand(0));
}
case ISD::VECREDUCE_FMIN: {
assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
Op.getOperand(0));
}
default:
llvm_unreachable("Unhandled reduction");
}
}
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
if (!Subtarget.hasLSE())
return SDValue();
// LSE has an atomic load-add instruction, but not a load-sub.
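// An atomic load-sub is therefore emitted as an LDADD of the negated
// operand, e.g. subtracting 5 becomes adding -5.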
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue RHS = Op.getOperand(2);
AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
Op.getOperand(0), Op.getOperand(1), RHS,
AN->getMemOperand());
}
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
if (!Subtarget.hasLSE())
return SDValue();
// LSE has an atomic load-clear instruction, but not a load-and.
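// An atomic load-and is therefore emitted as an LDCLR of the complemented
// operand: x & y == x & ~(~y), and LDCLR computes x & ~operand.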
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue RHS = Op.getOperand(2);
AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
Op.getOperand(0), Op.getOperand(1), RHS,
AN->getMemOperand());
}
SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
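// The Windows __chkstk helper expects the requested allocation size in
// 16-byte units in X15, hence the shift right by 4 here and the shift left
// by 4 after the call.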
Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
Chain =
DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
DAG.getRegisterMask(Mask), Chain.getValue(1));
// To match the actual intent better, we should read the output from X15 here
// again (instead of potentially spilling it to the stack), but rereading Size
// from X15 here doesn't work at -O0, since it thinks that X15 is undefined
// here.
Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
return Chain;
}
SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() &&
"Only Windows alloca probing supported");
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
EVT VT = Node->getValueType(0);
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
case Intrinsic::aarch64_neon_ld1x4:
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
case Intrinsic::aarch64_neon_ld2r:
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
Info.align = 0;
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane: {
Info.opc = ISD::INTRINSIC_VOID;
// Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0;
for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
Info.align = 0;
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
}
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::aarch64_ldaxp:
case Intrinsic::aarch64_ldxp:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = 16;
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_stlxp:
case Intrinsic::aarch64_stxp:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
Info.align = 16;
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
default:
break;
}
return false;
}
bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
// TODO: This may be worth removing. Check regression tests for diffs.
if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
return false;
// If we're reducing the load width in order to avoid having to use an extra
// instruction to do extension then it's probably a good idea.
if (ExtTy != ISD::NON_EXTLOAD)
return true;
// Don't reduce load width if it would prevent us from combining a shift into
// the offset.
MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
assert(Mem);
const SDValue &Base = Mem->getBasePtr();
if (Base.getOpcode() == ISD::ADD &&
Base.getOperand(1).getOpcode() == ISD::SHL &&
Base.getOperand(1).hasOneUse() &&
Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
// The shift can be combined if it matches the size of the value being
// loaded (and so reducing the width would make it not match).
uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
if (ShiftAmount == Log2_32(LoadBytes))
return false;
}
// We have no reason to disallow reducing the load width, so allow it.
return true;
}
// Truncations from 64-bit GPR to 32-bit GPR are free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 > NumBits2;
}
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 > NumBits2;
}
/// Check if it is profitable to hoist instruction in then/else to if.
/// Not profitable if I and its user can form an FMA instruction
/// because we prefer FMSUB/FMADD.
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
if (I->getOpcode() != Instruction::FMul)
return true;
if (!I->hasOneUse())
return true;
Instruction *User = I->user_back();
if (User &&
!(User->getOpcode() == Instruction::FSub ||
User->getOpcode() == Instruction::FAdd))
return true;
const TargetOptions &Options = getTargetMachine().Options;
const DataLayout &DL = I->getModule()->getDataLayout();
EVT VT = getValueType(DL, User->getOperand(0)->getType());
return !(isFMAFasterThanFMulAndFAdd(VT) &&
isOperationLegalOrCustom(ISD::FMA, VT) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
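// For example, a 32-bit move such as "mov w0, w1" also clears bits [63:32]
// of X0, so a zext from i32 to i64 needs no extra instruction.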
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
EVT VT1 = Val.getValueType();
if (isZExtFree(VT1, VT2)) {
return true;
}
if (Val.getOpcode() != ISD::LOAD)
return false;
// 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
VT1.getSizeInBits() <= 32);
}
bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
if (isa<FPExtInst>(Ext))
return false;
// Vector types are not free.
if (Ext->getType()->isVectorTy())
return false;
for (const Use &U : Ext->uses()) {
// The extension is free if we can fold it with a left shift in an
// addressing mode or an arithmetic operation: add, sub, and cmp.
// Is there a shift?
const Instruction *Instr = cast<Instruction>(U.getUser());
// Is this a constant shift?
switch (Instr->getOpcode()) {
case Instruction::Shl:
if (!isa<ConstantInt>(Instr->getOperand(1)))
return false;
break;
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
auto &DL = Ext->getModule()->getDataLayout();
std::advance(GTI, U.getOperandNo()-1);
Type *IdxTy = GTI.getIndexedType();
// This extension will end up with a shift because of the scaling factor.
// 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
// Get the shift amount based on the scaling factor:
// log2(sizeof(IdxTy)) - log2(8).
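// For example, an i32 index type has a store size of 4 bytes, giving a
// shift amount of 2.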
uint64_t ShiftAmt =
countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
// Is the constant foldable in the shift of the addressing mode?
// I.e., shift amount is between 1 and 4 inclusive.
if (ShiftAmt == 0 || ShiftAmt > 4)
return false;
break;
}
case Instruction::Trunc:
// Check if this is a noop.
// trunc(sext ty1 to ty2) to ty1.
if (Instr->getType() == Ext->getOperand(0)->getType())
continue;
LLVM_FALLTHROUGH;
default:
return false;
}
// At this point we can use the bfm family, so this extension is free
// for that use.
}
return true;
}
/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
auto *FullVT = cast<VectorType>(FullV->getType());
auto *HalfVT = cast<VectorType>(HalfV->getType());
return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
};
auto extractHalf = [](Value *FullV, Value *HalfV) {
auto *FullVT = cast<VectorType>(FullV->getType());
auto *HalfVT = cast<VectorType>(HalfV->getType());
return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
};
Constant *M1, *M2;
Value *S1Op1, *S2Op1;
if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
!match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
return false;
// Check that the operands are half as wide as the result and we extract
// half of the elements of the input vectors.
if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
!extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
return false;
// Check the mask extracts either the lower or upper half of vector
// elements.
int M1Start = -1;
int M2Start = -1;
int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
!ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
return false;
return true;
}
/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
auto areExtDoubled = [](Instruction *Ext) {
return Ext->getType()->getScalarSizeInBits() ==
2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
};
if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
!match(Ext2, m_ZExtOrSExt(m_Value())) ||
!areExtDoubled(cast<Instruction>(Ext1)) ||
!areExtDoubled(cast<Instruction>(Ext2)))
return false;
return true;
}
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
bool AArch64TargetLowering::shouldSinkOperands(
Instruction *I, SmallVectorImpl<Use *> &Ops) const {
if (!I->getType()->isVectorTy())
return false;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
case Intrinsic::aarch64_neon_umull:
if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
return false;
Ops.push_back(&II->getOperandUse(0));
Ops.push_back(&II->getOperandUse(1));
return true;
default:
return false;
}
}
switch (I->getOpcode()) {
case Instruction::Sub:
case Instruction::Add: {
if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
return false;
// If the exts' operands extract either the lower or upper elements, we
// can sink them too.
auto Ext1 = cast<Instruction>(I->getOperand(0));
auto Ext2 = cast<Instruction>(I->getOperand(1));
if (areExtractShuffleVectors(Ext1, Ext2)) {
Ops.push_back(&Ext1->getOperandUse(0));
Ops.push_back(&Ext2->getOperandUse(0));
}
Ops.push_back(&I->getOperandUse(0));
Ops.push_back(&I->getOperandUse(1));
return true;
}
default:
return false;
}
return false;
}
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
unsigned &RequiredAligment) const {
if (!LoadedType.isSimple() ||
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;
// Cyclone supports unaligned accesses.
RequiredAligment = 0;
unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 || NumBits == 64;
}
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
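/// For example, a 512-bit <16 x i32> is lowered as 4 accesses, while a
/// 64-bit <4 x i16> needs only 1.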
unsigned
AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
const DataLayout &DL) const {
return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}
MachineMemOperand::Flags
AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
return MOStridedAccess;
return MachineMemOperand::MONone;
}
bool AArch64TargetLowering::isLegalInterleavedAccessType(
VectorType *VecTy, const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
// Ensure the number of vector elements is greater than 1.
if (VecTy->getNumElements() < 2)
return false;
// Ensure the element type is legal.
if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
return false;
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
// 128 will be split into multiple interleaved accesses.
return VecSize == 64 || VecSize % 128 == 0;
}
/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
///
/// Into:
/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VecTy = Shuffles[0]->getType();
// Skip if we do not have NEON or if the vector type is illegal. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector size is divisible by 128.
if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
// A pointer vector cannot be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isPointerTy())
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
IRBuilder<> Builder(LI);
// The base address of the load.
Value *BaseAddr = LI->getPointerOperand();
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
VecTy = VectorType::get(VecTy->getVectorElementType(),
VecTy->getVectorNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
BaseAddr, VecTy->getVectorElementType()->getPointerTo(
LI->getPointerAddressSpace()));
}
Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {VecTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
Intrinsic::aarch64_neon_ld3,
Intrinsic::aarch64_neon_ld4};
Function *LdNFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
// replace.
DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
BaseAddr =
Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
VecTy->getVectorNumElements() * Factor);
CallInst *LdN = Builder.CreateCall(
LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
// Extract and store the sub-vectors returned by the load intrinsic.
for (unsigned i = 0; i < Shuffles.size(); i++) {
ShuffleVectorInst *SVI = Shuffles[i];
unsigned Index = Indices[i];
Value *SubVec = Builder.CreateExtractValue(LdN, Index);
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
VecTy->getVectorNumElements()));
SubVecs[SVI].push_back(SubVec);
}
}
// Replace uses of the shufflevector instructions with the sub-vectors
// returned by the load intrinsic. If a shufflevector instruction is
// associated with more than one sub-vector, those sub-vectors will be
// concatenated into a single wide vector.
for (ShuffleVectorInst *SVI : Shuffles) {
auto &SubVec = SubVecs[SVI];
auto *WideVec =
SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
SVI->replaceAllUsesWith(WideVec);
}
return true;
}
/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
VectorType *VecTy = SVI->getType();
assert(VecTy->getVectorNumElements() % Factor == 0 &&
"Invalid interleaved store");
unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
Type *EltTy = VecTy->getVectorElementType();
VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
// Skip if we do not have NEON or if the vector type is illegal. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector size is divisible by 128.
if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
return false;
unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
// StN intrinsics don't support pointer vectors as arguments. Convert pointer
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
Type *IntTy = DL.getIntPtrType(EltTy);
unsigned NumOpElts = Op0->getType()->getVectorNumElements();
// Convert to the corresponding integer vector.
Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
SubVecTy = VectorType::get(IntTy, LaneLen);
}
// The base address of the store.
Value *BaseAddr = SI->getPointerOperand();
if (NumStores > 1) {
// If we're going to generate more than one store, reset the lane length
// and sub-vector type to something legal.
LaneLen /= NumStores;
SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
// We will compute the pointer operand of each store from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
SI->getPointerAddressSpace()));
}
auto Mask = SVI->getShuffleMask();
Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
Type *Tys[2] = {SubVecTy, PtrTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
Intrinsic::aarch64_neon_st3,
Intrinsic::aarch64_neon_st4};
Function *StNFunc =
Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
SmallVector<Value *, 5> Ops;
// Split the shufflevector operands into sub vectors for the new stN call.
for (unsigned i = 0; i < Factor; i++) {
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
Ops.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
unsigned IdxJ = StoreCount * LaneLen * Factor + j;
if (Mask[IdxJ * Factor + IdxI] >= 0) {
StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
break;
}
}
// Note: Filling undef gaps with random elements is OK, since
// those elements were being written anyway (with undefs).
// In the case of all undefs we default to using elements from 0.
// Note: StartMask cannot be negative; it is checked in
// isReInterleaveMask.
Ops.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
}
// If we're generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
BaseAddr, LaneLen * Factor);
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
Builder.CreateCall(StNFunc, Ops);
}
return true;
}
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
unsigned AlignCheck) {
return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
(DstAlign == 0 || DstAlign % AlignCheck == 0));
}
EVT AArch64TargetLowering::getOptimalMemOpType(
uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
// Only use AdvSIMD to implement memsets of 32 bytes and above: for anything
// smaller it would take one instruction to materialize the v2i64 zero plus
// one store (with a restrictive addressing mode), so just do i64 stores.
bool IsSmallMemset = IsMemset && Size < 32;
auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
&Fast) &&
Fast;
};
if (CanUseNEON && IsMemset && !IsSmallMemset &&
AlignmentIsAcceptable(MVT::v2i64, 16))
return MVT::v2i64;
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
return MVT::f128;
if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
return MVT::i64;
if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
return MVT::i32;
return MVT::Other;
}
// 12-bit optionally shifted immediates are legal for adds.
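// For example, 4095 (0xfff) and 0xfff000 (4095 << 12) are legal add
// immediates, while 4097 is not, since it needs bits both below and above
// bit 12.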
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
<< ": avoid UB for INT64_MIN\n");
return false;
}
// Same encoding for add/sub, just flip the sign.
Immed = std::abs(Immed);
bool IsLegal = ((Immed >> 12) == 0 ||
((Immed & 0xfff) == 0 && Immed >> 24 == 0));
LLVM_DEBUG(dbgs() << "Is " << Immed
<< " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
return IsLegal;
}
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
return isLegalAddImmediate(Immed);
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS, Instruction *I) const {
// AArch64 has five basic addressing modes:
// reg
// reg + 9-bit signed offset
// reg + SIZE_IN_BYTES * 12-bit unsigned offset
// reg1 + reg2
// reg + SIZE_IN_BYTES * reg
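// For example (for an 8-byte load): ldr x0, [x1]; ldur x0, [x1, #-8];
// ldr x0, [x1, #32]; ldr x0, [x1, x2]; ldr x0, [x1, x2, lsl #3].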
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// No reg+reg+imm addressing.
if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
return false;
// check reg + imm case:
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
uint64_t NumBytes = 0;
if (Ty->isSized()) {
uint64_t NumBits = DL.getTypeSizeInBits(Ty);
NumBytes = NumBits / 8;
if (!isPowerOf2_64(NumBits))
NumBytes = 0;
}
if (!AM.Scale) {
int64_t Offset = AM.BaseOffs;
// 9-bit signed offset
if (isInt<9>(Offset))
return true;
// 12-bit unsigned offset
unsigned shift = Log2_64(NumBytes);
if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
// Must be a multiple of NumBytes (NumBytes is a power of 2)
(Offset >> shift) << shift == Offset)
return true;
return false;
}
// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
// Consider splitting large offset of struct or array.
return true;
}
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// Operands | Rt Latency
// -------------------------------------------
// Rt, [Xn, Xm] | 4
// -------------------------------------------
// Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
// Rt, [Xn, Wm, <extend> #imm] |
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1 if
// it is not equal to 0 or 1.
return AM.Scale != 0 && AM.Scale != 1;
return -1;
}
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
default:
break;
}
return false;
}
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints.
static const MCPhysReg ScratchRegs[] = {
AArch64::X16, AArch64::X17, AArch64::LR, 0
};
return ScratchRegs;
}
bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
N = N->getOperand(0).getNode();
EVT VT = N->getValueType(0);
// If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
// it with shift to let it be lowered to UBFX.
if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
isa<ConstantSDNode>(N->getOperand(1))) {
uint64_t TruncMask = N->getConstantOperandVal(1);
if (isMask_64(TruncMask) &&
N->getOperand(0).getOpcode() == ISD::SRL &&
isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
return false;
}
return true;
}
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)
return false;
int64_t Val = Imm.getSExtValue();
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
return true;
if ((int64_t)Val < 0)
Val = ~Val;
if (BitSize == 32)
Val &= (1LL << 32) - 1;
unsigned LZ = countLeadingZeros((uint64_t)Val);
unsigned Shift = (63 - LZ) / 16;
// MOVZ is free so return true for one or fewer MOVK.
return Shift < 3;
}
bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
return (Index == 0 || Index == ResVT.getVectorNumElements());
}
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// cmge X, X, #0
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
if (!Subtarget->hasNEON() || !VT.isVector())
return SDValue();
// There must be a shift right algebraic before the xor, and the xor must be a
// 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
return SDValue();
return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}
// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
// and change it to SUB and CSEL.
if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
N0.getOperand(0));
// Generate SUBS & CSEL.
SDValue Cmp =
DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
N0.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
SDValue(Cmp.getNode(), 1));
}
return SDValue();
}
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
return performIntegerAbsCombine(N, DAG);
}
SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
!(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
unsigned Lg2 = Divisor.countTrailingZeros();
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
// Add (N0 < 0) ? Pow2 - 1 : 0;
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
Created.push_back(Cmp.getNode());
Created.push_back(Add.getNode());
Created.push_back(CSel.getNode());
// Divide by pow2.
SDValue SRA =
DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
if (Divisor.isNonNegative())
return SRA;
Created.push_back(SRA.getNode());
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
}
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
// The below optimizations require a constant RHS.
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
const APInt &ConstValue = C->getAPIntValue();
// Multiplication of a power of two plus/minus one can be done more
// cheaply as a shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
// More aggressively, some multiplications N0 * C can be lowered to
// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
// e.g. 6=3*2=(2+1)*2.
// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
// which equals (1+2)*16-(1+2).
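// For example, x * 6 with 6 = (2 + 1) * 2 becomes
// (shl (add (shl x, 1), x), 1).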
SDValue N0 = N->getOperand(0);
// TrailingZeroes is used to test if the mul can be lowered to
// shift+add+shift.
unsigned TrailingZeroes = ConstValue.countTrailingZeros();
if (TrailingZeroes) {
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into smul or umul.
if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
isZeroExtended(N0.getNode(), DAG)))
return SDValue();
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into madd or msub.
if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
N->use_begin()->getOpcode() == ISD::SUB))
return SDValue();
}
// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
// and shift+add+shift.
APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
unsigned ShiftAmt, AddSubOpc;
// Is the shifted value the LHS operand of the add/sub?
bool ShiftValUseIsN0 = true;
// Do we need to negate the result?
bool NegateResult = false;
if (ConstValue.isNonNegative()) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// (mul x, 2^N - 1) => (sub (shl x, N), x)
// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
APInt SCVMinus1 = ShiftedConstValue - 1;
APInt CVPlus1 = ConstValue + 1;
if (SCVMinus1.isPowerOf2()) {
ShiftAmt = SCVMinus1.logBase2();
AddSubOpc = ISD::ADD;
} else if (CVPlus1.isPowerOf2()) {
ShiftAmt = CVPlus1.logBase2();
AddSubOpc = ISD::SUB;
} else
return SDValue();
} else {
// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
APInt CVNegPlus1 = -ConstValue + 1;
APInt CVNegMinus1 = -ConstValue - 1;
if (CVNegPlus1.isPowerOf2()) {
ShiftAmt = CVNegPlus1.logBase2();
AddSubOpc = ISD::SUB;
ShiftValUseIsN0 = false;
} else if (CVNegMinus1.isPowerOf2()) {
ShiftAmt = CVNegMinus1.logBase2();
AddSubOpc = ISD::ADD;
NegateResult = true;
} else
return SDValue();
}
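// Worked example for ConstValue == 6: TrailingZeroes == 1 and
// ShiftedConstValue == 3, so ShiftAmt == 1 with AddSubOpc == ISD::ADD.
// The nodes built below form (shl (add (shl x, 1), x), 1), i.e.
// (3 * x) << 1 == 6 * x.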
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(ShiftAmt, DL, MVT::i64));
SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
assert(!(NegateResult && TrailingZeroes) &&
"NegateResult and TrailingZeroes cannot both be true for now.");
// Negate the result.
if (NegateResult)
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
// Shift the result.
if (TrailingZeroes)
return DAG.getNode(ISD::SHL, DL, VT, Res,
DAG.getConstant(TrailingZeroes, DL, MVT::i64));
return Res;
}
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away operation when it's from a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
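// E.g. for UNARYOP == sint_to_fp and constant == splat(1), constant2 is
// splat(1.0): each compare lane is all-ones or all-zeros, and
// sint_to_fp(0) is +0.0 (an all-zero bit pattern), so masking the
// converted constant with the compare result yields the same lanes.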
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (BuildVectorSDNode *BV =
dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
N->getOperand(0)->getOperand(0), MaskConst);
SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
return Res;
}
return SDValue();
}
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
return Res;
EVT VT = N->getValueType(0);
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
// Only optimize when the source and destination types have the same width.
if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
return SDValue();
// If the result of an integer load is only used by an integer-to-float
// conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
// This eliminates an "integer-to-vector-move" UOP and improves throughput.
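// E.g. instead of something like "ldr w8, [x0]; scvtf s0, w8" this lets
// isel produce roughly "ldr s0, [x0]; scvtf s0, s0", keeping the value in
// the FP/SIMD register file.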
SDValue N0 = N->getOperand(0);
if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
LN0->getPointerInfo(), LN0->getAlignment(),
LN0->getMemOperand()->getFlags());
// Make sure successors of the original load stay after it by updating them
// to use the new Chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
unsigned Opcode =
(N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
return DAG.getNode(Opcode, SDLoc(N), VT, Load);
}
return SDValue();
}
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
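/// E.g. (fp_to_sint (fmul v, splat(16.0))) can use the
/// aarch64_neon_vcvtfp2fxs intrinsic with 4 fractional bits (roughly an
/// "fcvtzs v0.4s, v0.4s, #4"), since multiplying by 2^4 before the
/// conversion is the same as converting to a fixed-point value with 4
/// fractional bits.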
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
if (!N->getValueType(0).isSimple())
return SDValue();
SDValue Op = N->getOperand(0);
if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
uint32_t FloatBits = FloatTy.getSizeInBits();
if (FloatBits != 32 && FloatBits != 64)
return SDValue();
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
uint32_t IntBits = IntTy.getSizeInBits();
if (IntBits != 16 && IntBits != 32 && IntBits != 64)
return SDValue();
// Avoid conversions where iN is larger than the float (e.g., float -> i64).
if (IntBits > FloatBits)
return SDValue();
BitVector UndefElements;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
int32_t Bits = IntBits == 64 ? 64 : 32;
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
if (C == -1 || C == 0 || C > Bits)
return SDValue();
MVT ResTy;
unsigned NumLanes = Op.getValueType().getVectorNumElements();
switch (NumLanes) {
default:
return SDValue();
case 2:
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
return SDValue();
assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
"Illegal vector type after legalization");
SDLoc DL(N);
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
: Intrinsic::aarch64_neon_vcvtfp2fxu;
SDValue FixConv =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
// We can handle smaller integers by generating an extra trunc.
if (IntBits < FloatBits)
FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
return FixConv;
}
/// Fold a floating-point divide by power of two into fixed-point to
/// floating-point conversion.
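/// E.g. (fdiv (sint_to_fp v), splat(16.0)) can use the
/// aarch64_neon_vcvtfxs2fp intrinsic with 4 fractional bits (roughly an
/// "scvtf v0.4s, v0.4s, #4"), which divides by 2^4 as part of the
/// conversion.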
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue Op = N->getOperand(0);
unsigned Opc = Op->getOpcode();
if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
!Op.getOperand(0).getValueType().isSimple() ||
(Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
return SDValue();
SDValue ConstVec = N->getOperand(1);
if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
int32_t IntBits = IntTy.getSizeInBits();
if (IntBits != 16 && IntBits != 32 && IntBits != 64)
return SDValue();
MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
int32_t FloatBits = FloatTy.getSizeInBits();
if (FloatBits != 32 && FloatBits != 64)
return SDValue();
// Avoid conversions where iN is larger than the float (e.g., i64 -> float).
if (IntBits > FloatBits)
return SDValue();
BitVector UndefElements;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
if (C == -1 || C == 0 || C > FloatBits)
return SDValue();
MVT ResTy;
unsigned NumLanes = Op.getValueType().getVectorNumElements();
switch (NumLanes) {
default:
return SDValue();
case 2:
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc DL(N);
SDValue ConvInput = Op.getOperand(0);
bool IsSigned = Opc == ISD::SINT_TO_FP;
if (IntBits < FloatBits)
ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
ResTy, ConvInput);
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
: Intrinsic::aarch64_neon_vcvtfxu2fp;
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
DAG.getConstant(C, DL, MVT::i32));
}
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
bool &FromHi) {
if (N.getOpcode() == ISD::SHL)
FromHi = false;
else if (N.getOpcode() == ISD::SRL)
FromHi = true;
else
return false;
if (!isa<ConstantSDNode>(N.getOperand(1)))
return false;
ShiftAmount = N->getConstantOperandVal(1);
Src = N->getOperand(0);
return true;
}
/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
/// with an EXTR. Can't quite be done in TableGen because the two immediates
/// aren't independent.
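/// E.g. for i32, (or (shl x, #8), (srl y, #24)) becomes (EXTR x, y, #24),
/// which extracts 32 bits starting at bit 24 of the x:y register pair.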
static SDValue tryCombineToEXTR(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
EVT VT = N->getValueType(0);
assert(N->getOpcode() == ISD::OR && "Unexpected root");
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
SDValue LHS;
uint32_t ShiftLHS = 0;
bool LHSFromHi = false;
if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
return SDValue();
SDValue RHS;
uint32_t ShiftRHS = 0;
bool RHSFromHi = false;
if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
return SDValue();
// If they're both trying to come from the high part of the register, they're
// not really an EXTR.
if (LHSFromHi == RHSFromHi)
return SDValue();
if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
return SDValue();
if (LHSFromHi) {
std::swap(LHS, RHS);
std::swap(ShiftLHS, ShiftRHS);
}
return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
DAG.getConstant(ShiftRHS, DL, MVT::i64));
}
static SDValue tryCombineToBSL(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
if (!VT.isVector())
return SDValue();
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::AND)
return SDValue();
SDValue N1 = N->getOperand(1);
if (N1.getOpcode() != ISD::AND)
return SDValue();
// We only have to look for constant vectors here since the general, variable
// case can be handled in TableGen.
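// Concretely, we look for (or (and A, C), (and B, ~C)) with a constant
// vector C and emit (BSL C, A, B), i.e. a per-bit select
// (A & C) | (B & ~C).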
unsigned Bits = VT.getScalarSizeInBits();
uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
for (int i = 1; i >= 0; --i)
for (int j = 1; j >= 0; --j) {
BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
if (!BVN0 || !BVN1)
continue;
bool FoundMatch = true;
for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
if (!CN0 || !CN1 ||
CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
FoundMatch = false;
break;
}
}
if (FoundMatch)
return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
N0->getOperand(1 - i), N1->getOperand(1 - j));
}
return SDValue();
}
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
// Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
if (SDValue Res = tryCombineToEXTR(N, DCI))
return Res;
if (SDValue Res = tryCombineToBSL(N, DCI))
return Res;
return SDValue();
}
static SDValue performANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDValue LHS = N->getOperand(0);
EVT VT = N->getValueType(0);
if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
BuildVectorSDNode *BVN =
dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
if (!BVN)
return SDValue();
// AND does not accept an immediate, so check if we can use a BIC immediate
// instruction instead. We do this here instead of using a (and x, (mvni imm))
// pattern in isel, because some immediates may be lowered to the preferred
// (and x, (movi imm)) form, even though an mvni representation also exists.
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
DefBits = ~DefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
DefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
DefBits, &LHS)))
return NewOp;
UndefBits = ~UndefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
UndefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
UndefBits, &LHS)))
return NewOp;
}
return SDValue();
}
static SDValue performSRLCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
// Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
// high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
// to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() == ISD::BSWAP) {
SDLoc DL(N);
SDValue N1 = N->getOperand(1);
SDValue N00 = N0.getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
uint64_t ShiftAmt = C->getZExtValue();
if (VT == MVT::i32 && ShiftAmt == 16 &&
DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
if (VT == MVT::i64 && ShiftAmt == 32 &&
DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
}
}
return SDValue();
}
static SDValue performBitcastCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// Remove extraneous bitcasts around an extract_subvector.
// For example,
// (v4i16 (bitconvert
// (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
// becomes
// (extract_subvector ((v8i16 ...), (i64 4)))
// Only interested in 64-bit vectors as the ultimate result.
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
if (VT.getSimpleVT().getSizeInBits() != 64)
return SDValue();
// Is the operand an extract_subvector starting at the beginning or halfway
// point of the vector? A low half may also come through as an
// EXTRACT_SUBREG, so look for that, too.
SDValue Op0 = N->getOperand(0);
if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
!(Op0->isMachineOpcode() &&
Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
return SDValue();
uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
return SDValue();
} else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
if (idx != AArch64::dsub)
return SDValue();
// The dsub reference is equivalent to a lane zero subvector reference.
idx = 0;
}
// Look through the bitcast of the input to the extract.
if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
return SDValue();
SDValue Source = Op0->getOperand(0)->getOperand(0);
// If the source type has twice the number of elements as our destination
// type, we know this is an extract of the high or low half of the vector.
EVT SVT = Source->getValueType(0);
if (!SVT.isVector() ||
SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
return SDValue();
LLVM_DEBUG(
dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
// Create the simplified form to just extract the low or high half of the
// vector directly rather than bothering with the bitcasts.
SDLoc dl(N);
unsigned NumElements = VT.getVectorNumElements();
if (idx) {
SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
} else {
SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
Source, SubReg),
0);
}
}
static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
// Optimize concat_vectors of truncated vectors, where the intermediate
// type is illegal, to avoid said illegality, e.g.,
// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
// (v2i16 (truncate (v2i64)))))
// ->
// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
// (v4i32 (bitcast (v2i64))),
// <0, 2, 4, 6>)))
// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
// on both input and result type, so we might generate worse code.
// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
if (N->getNumOperands() == 2 &&
N0->getOpcode() == ISD::TRUNCATE &&
N1->getOpcode() == ISD::TRUNCATE) {
SDValue N00 = N0->getOperand(0);
SDValue N10 = N1->getOperand(0);
EVT N00VT = N00.getValueType();
if (N00VT == N10.getValueType() &&
(N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
for (size_t i = 0; i < Mask.size(); ++i)
Mask[i] = i * 2;
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getVectorShuffle(
MidVT, dl,
DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
}
}
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
if (N0 == N1 && VT.getVectorNumElements() == 2) {
assert(VT.getScalarSizeInBits() == 64);
return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
DAG.getConstant(0, dl, MVT::i64));
}
// Canonicalise concat_vectors so that the right-hand vector has as few
// bit-casts as possible before its real operation. The primary matching
// destination for these operations will be the narrowing "2" instructions,
// which depend on the operation being performed on this right-hand vector.
// For example,
// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
if (N1->getOpcode() != ISD::BITCAST)
return SDValue();
SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
// If the RHS is not a vector, this is not the pattern we're looking for.
if (!RHSTy.isVector())
return SDValue();
LLVM_DEBUG(
dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
RHSTy.getVectorNumElements() * 2);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
RHS));
}
static SDValue tryCombineFixedPointConvert(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Wait until after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// Transform a scalar conversion of a value from a lane extract into a
// lane extract of a vector conversion. E.g., from foo1 to foo2:
// double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
// double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
//
// The second form interacts better with instruction selection and the
// register allocator to avoid cross-class register copies that aren't
// coalescable due to a lane reference.
// Check the operand and see if it originates from a lane extract.
SDValue Op1 = N->getOperand(1);
if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
// Yep, no additional predication needed. Perform the transform.
SDValue IID = N->getOperand(0);
SDValue Shift = N->getOperand(2);
SDValue Vec = Op1.getOperand(0);
SDValue Lane = Op1.getOperand(1);
EVT ResTy = N->getValueType(0);
EVT VecResTy;
SDLoc DL(N);
// The vector width should be 128 bits by the time we get here, even
// if it started as 64 bits (the extract_vector handling will have
// done so).
assert(Vec.getValueSizeInBits() == 128 &&
"unexpected vector size on extract_vector_elt!");
if (Vec.getValueType() == MVT::v4i32)
VecResTy = MVT::v4f32;
else if (Vec.getValueType() == MVT::v2i64)
VecResTy = MVT::v2f64;
else
llvm_unreachable("unexpected vector type!");
SDValue Convert =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
}
return SDValue();
}
// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
// (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
switch (N.getOpcode()) {
case AArch64ISD::DUP:
case AArch64ISD::DUPLANE8:
case AArch64ISD::DUPLANE16:
case AArch64ISD::DUPLANE32:
case AArch64ISD::DUPLANE64:
case AArch64ISD::MOVI:
case AArch64ISD::MOVIshift:
case AArch64ISD::MOVIedit:
case AArch64ISD::MOVImsl:
case AArch64ISD::MVNIshift:
case AArch64ISD::MVNImsl:
break;
default:
// FMOV could be supported, but isn't very useful, as it would only occur
// if you passed a bitcast'd floating-point immediate to an eligible long
// integer op (addl, smull, ...).
return SDValue();
}
MVT NarrowTy = N.getSimpleValueType();
if (!NarrowTy.is64BitVector())
return SDValue();
MVT ElementTy = NarrowTy.getVectorElementType();
unsigned NumElems = NarrowTy.getVectorNumElements();
MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
SDLoc dl(N);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
DAG.getConstant(NumElems, dl, MVT::i64));
}
static bool isEssentiallyExtractHighSubvector(SDValue N) {
if (N.getOpcode() == ISD::BITCAST)
N = N.getOperand(0);
if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
/// Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
const SDValue *Opnd0;
const SDValue *Opnd1;
ISD::CondCode CC;
};
/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
const SDValue *Cmp;
AArch64CC::CondCode CC;
};
/// Helper structure to keep track of SetCC information.
union SetCCInfo {
GenericSetCCInfo Generic;
AArch64SetCCInfo AArch64;
};
/// Helper structure to be able to read SetCC information. If the IsAArch64
/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
SetCCInfo Info;
bool IsAArch64;
};
/// Check whether or not \p Op is a SET_CC operation, either a generic or an
/// AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
// If this is a setcc, this is straightforward.
if (Op.getOpcode() == ISD::SETCC) {
SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SetCCInfo.IsAArch64 = false;
return true;
}
// Otherwise, check if this is a matching csel instruction.
// In other words:
// - csel 1, 0, cc
// - csel 0, 1, !cc
if (Op.getOpcode() != AArch64ISD::CSEL)
return false;
// Set the information about the operands.
// TODO: we want the operands of the Cmp not the csel
SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
SetCCInfo.IsAArch64 = true;
SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// Check that the operands match the constraints:
// (1) Both operands must be constants.
// (2) One must be 1 and the other must be 0.
ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
// Check (1).
if (!TValue || !FValue)
return false;
// Check (2).
if (!TValue->isOne()) {
// Update the comparison when we are interested in !cc.
std::swap(TValue, FValue);
SetCCInfo.Info.AArch64.CC =
AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
}
return TValue->isOne() && FValue->isNullValue();
}
// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
if (isSetCC(Op, Info))
return true;
return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
isSetCC(Op->getOperand(0), Info));
}
// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
// -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
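// (CSINC Wd, Wn, Wm, cond yields Wn when cond holds and Wm + 1 otherwise,
// so the whole add-of-a-flag collapses into a single conditional
// increment.)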
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
SDValue LHS = Op->getOperand(0);
SDValue RHS = Op->getOperand(1);
SetCCInfoAndKind InfoAndKind;
// If neither operand is a SET_CC, give up.
if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
std::swap(LHS, RHS);
if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
return SDValue();
}
// FIXME: This could be generalized to work for FP comparisons.
EVT CmpVT = InfoAndKind.IsAArch64
? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
: InfoAndKind.Info.Generic.Opnd0->getValueType();
if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
return SDValue();
SDValue CCVal;
SDValue Cmp;
SDLoc dl(Op);
if (InfoAndKind.IsAArch64) {
CCVal = DAG.getConstant(
AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
MVT::i32);
Cmp = *InfoAndKind.Info.AArch64.Cmp;
} else
Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
*InfoAndKind.Info.Generic.Opnd1,
ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
CCVal, DAG, dl);
EVT VT = Op->getValueType(0);
LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
// (add (zeroext (extract_high LHS)),
// (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
static SDValue performAddSubLongCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
MVT VT = N->getSimpleValueType(0);
if (!VT.is128BitVector()) {
if (N->getOpcode() == ISD::ADD)
return performSetccAddFolding(N, DAG);
return SDValue();
}
// Make sure both branches are extended in the same way.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
LHS.getOpcode() != ISD::SIGN_EXTEND) ||
LHS.getOpcode() != RHS.getOpcode())
return SDValue();
unsigned ExtType = LHS.getOpcode();
// It's not worth doing if at least one of the inputs isn't already an
// extract, but we don't know which it'll be so we have to try both.
if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
if (!RHS.getNode())
return SDValue();
RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
} else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
if (!LHS.getNode())
return SDValue();
LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
}
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}
// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
// (aarch64_neon_umull (extract_high (v2i64 vec))
// (extract_high (v2i64 (dup128 scalar))))
//
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
assert(LHS.getValueType().is64BitVector() &&
RHS.getValueType().is64BitVector() &&
"unexpected shape for long operation");
// Either node could be a DUP, but it's not worth doing both of them (you'd
// just as well use the non-high version) so look for a corresponding extract
// operation on the other "wing".
if (isEssentiallyExtractHighSubvector(LHS)) {
RHS = tryExtendDUPToExtractHigh(RHS, DAG);
if (!RHS.getNode())
return SDValue();
} else if (isEssentiallyExtractHighSubvector(RHS)) {
LHS = tryExtendDUPToExtractHigh(LHS, DAG);
if (!LHS.getNode())
return SDValue();
}
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
N->getOperand(0), LHS, RHS);
}
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
MVT ElemTy = N->getSimpleValueType(0).getScalarType();
unsigned ElemBits = ElemTy.getSizeInBits();
int64_t ShiftAmount;
if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
HasAnyUndefs, ElemBits) ||
SplatBitSize != ElemBits)
return SDValue();
ShiftAmount = SplatValue.getSExtValue();
} else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
ShiftAmount = CVN->getSExtValue();
} else
return SDValue();
unsigned Opcode;
bool IsRightShift;
switch (IID) {
default:
llvm_unreachable("Unknown shift intrinsic");
case Intrinsic::aarch64_neon_sqshl:
Opcode = AArch64ISD::SQSHL_I;
IsRightShift = false;
break;
case Intrinsic::aarch64_neon_uqshl:
Opcode = AArch64ISD::UQSHL_I;
IsRightShift = false;
break;
case Intrinsic::aarch64_neon_srshl:
Opcode = AArch64ISD::SRSHR_I;
IsRightShift = true;
break;
case Intrinsic::aarch64_neon_urshl:
Opcode = AArch64ISD::URSHR_I;
IsRightShift = true;
break;
case Intrinsic::aarch64_neon_sqshlu:
Opcode = AArch64ISD::SQSHLU_I;
IsRightShift = false;
break;
}
if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
SDLoc dl(N);
return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
DAG.getConstant(-ShiftAmount, dl, MVT::i32));
} else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
SDLoc dl(N);
return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
DAG.getConstant(ShiftAmount, dl, MVT::i32));
}
return SDValue();
}
// The CRC32[BH] instructions ignore the high bits of their data operand. Since
// the intrinsics must be legal and take an i32, this means there's almost
// certainly going to be a zext in the DAG which we can eliminate.
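// E.g. crc32b(crc, zext i8 %x to i32) shows up here with the zext lowered
// to (and %x, 0xff); dropping the AND is safe because CRC32B only reads
// the low 8 bits of its data operand anyway.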
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
SDValue AndN = N->getOperand(2);
if (AndN.getOpcode() != ISD::AND)
return SDValue();
ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
if (!CMask || CMask->getZExtValue() != Mask)
return SDValue();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
}
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
SelectionDAG &DAG) {
SDLoc dl(N);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
DAG.getNode(Opc, dl,
N->getOperand(1).getSimpleValueType(),
N->getOperand(1)),
DAG.getConstant(0, dl, MVT::i64));
}
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
unsigned IID = getIntrinsicID(N);
switch (IID) {
default:
break;
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
case Intrinsic::aarch64_neon_saddv:
return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
case Intrinsic::aarch64_neon_uaddv:
return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
case Intrinsic::aarch64_neon_sminv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
case Intrinsic::aarch64_neon_uminv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
case Intrinsic::aarch64_neon_smaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
case Intrinsic::aarch64_neon_umaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
case Intrinsic::aarch64_neon_fmax:
return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmin:
return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmaxnm:
return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fminnm:
return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_sqdmull:
return tryCombineLongOpWithDup(IID, N, DCI, DAG);
case Intrinsic::aarch64_neon_sqshl:
case Intrinsic::aarch64_neon_uqshl:
case Intrinsic::aarch64_neon_sqshlu:
case Intrinsic::aarch64_neon_srshl:
case Intrinsic::aarch64_neon_urshl:
return tryCombineShiftImm(IID, N, DAG);
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
return tryCombineCRC32(0xff, N, DAG);
case Intrinsic::aarch64_crc32h:
case Intrinsic::aarch64_crc32ch:
return tryCombineCRC32(0xffff, N, DAG);
}
return SDValue();
}
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
// we can convert that DUP into another extract_high (of a bigger DUP), which
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
SDNode *ABDNode = N->getOperand(0).getNode();
unsigned IID = getIntrinsicID(ABDNode);
if (IID == Intrinsic::aarch64_neon_sabd ||
IID == Intrinsic::aarch64_neon_uabd) {
SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
if (!NewABD.getNode())
return SDValue();
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
NewABD);
}
}
// This is effectively a custom type legalization for AArch64.
//
// Type legalization will split an extend of a small, legal, type to a larger
// illegal type by first splitting the destination type, often creating
// illegal source types, which then get legalized in isel-confusing ways,
// leading to really terrible codegen. E.g.,
// %result = v8i32 sext v8i8 %value
// becomes
// %losrc = extract_subreg %value, ...
// %hisrc = extract_subreg %value, ...
// %lo = v4i32 sext v4i8 %losrc
// %hi = v4i32 sext v4i8 %hisrc
// Things go rapidly downhill from there.
//
// For AArch64, the [sz]ext vector instructions can only go up one element
// size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
// takes two instructions.
//
// This implies that the most efficient way to do the extend from v8i8
// to two v4i32 values is to first extend the v8i8 to v8i16, then let
// the normal splitting happen for the v8i16->v8i32.
// This is pre-legalization to catch some cases where the default
// type legalization will create ill-tempered code.
if (!DCI.isBeforeLegalizeOps())
return SDValue();
// We're only interested in cleaning things up for non-legal vector types
// here. If both the source and destination are legal, things will just
// work naturally without any fiddling.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ResVT = N->getValueType(0);
if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
return SDValue();
// If the vector type isn't a simple VT, it's beyond the scope of what
// we're worried about here. Let legalization do its thing and hope for
// the best.
SDValue Src = N->getOperand(0);
EVT SrcVT = Src->getValueType(0);
if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
// If the source VT is a 64-bit vector, we can play games and get the
// better results we want.
if (SrcVT.getSizeInBits() != 64)
return SDValue();
unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
unsigned ElementCount = SrcVT.getVectorNumElements();
SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
SDLoc DL(N);
Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
// Now split the rest of the operation into two halves, each with a 64
// bit source.
EVT LoVT, HiVT;
SDValue Lo, Hi;
unsigned NumElements = ResVT.getVectorNumElements();
assert(!(NumElements & 1) && "Splitting vector, but not in half!");
LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
ResVT.getVectorElementType(), NumElements / 2);
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
LoVT.getVectorNumElements());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(0, DL, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
// Now combine the parts back together so we still have a single result
// like the combiner expects.
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
SDValue SplatVal, unsigned NumVecElts) {
assert(!St.isTruncatingStore() && "cannot split truncating vector store");
unsigned OrigAlignment = St.getAlignment();
unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
// Create scalar stores. This is at least as good as the code sequence for a
// split unaligned store which is a dup.s, ext.b, and two stores.
// Most of the time the three stores should be replaced by store pair
// instructions (stp).
SDLoc DL(&St);
SDValue BasePtr = St.getBasePtr();
uint64_t BaseOffset = 0;
const MachinePointerInfo &PtrInfo = St.getPointerInfo();
SDValue NewST1 =
DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
OrigAlignment, St.getMemOperand()->getFlags());
// As this is in ISel, we will not merge this add, which may degrade results.
if (BasePtr->getOpcode() == ISD::ADD &&
isa<ConstantSDNode>(BasePtr->getOperand(1))) {
BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
BasePtr = BasePtr->getOperand(0);
}
unsigned Offset = EltOffset;
while (--NumVecElts) {
unsigned Alignment = MinAlign(OrigAlignment, Offset);
SDValue OffsetPtr =
DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
PtrInfo.getWithOffset(Offset), Alignment,
St.getMemOperand()->getFlags());
Offset += EltOffset;
}
return NewST1;
}
/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instruction and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
/// stp xzr, xzr, [x0]
///
/// instead of:
///
/// movi v0.2d, #0
/// str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
// It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
// 2, 3 or 4 i32 elements.
int NumVecElts = VT.getVectorNumElements();
if (!(((NumVecElts == 2 || NumVecElts == 3) &&
VT.getVectorElementType().getSizeInBits() == 64) ||
((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
VT.getVectorElementType().getSizeInBits() == 32)))
return SDValue();
if (StVal.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// If the zero constant has more than one use then the vector store could be
// better since the constant mov will be amortized and stp q instructions
// should be able to be formed.
if (!StVal.hasOneUse())
return SDValue();
// If the store is truncating then it's going down to i16 or smaller, which
// means it can be implemented in a single store anyway.
if (St.isTruncatingStore())
return SDValue();
// If the immediate offset of the address operand is too large for the stp
// instruction, then bail out.
if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
if (Offset < -512 || Offset > 504)
return SDValue();
}
for (int I = 0; I < NumVecElts; ++I) {
SDValue EltVal = StVal.getOperand(I);
if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
return SDValue();
}
// Use a CopyFromReg WZR/XZR here to prevent
// DAGCombiner::MergeConsecutiveStores from undoing this transformation.
SDLoc DL(&St);
unsigned ZeroReg;
EVT ZeroVT;
if (VT.getVectorElementType().getSizeInBits() == 32) {
ZeroReg = AArch64::WZR;
ZeroVT = MVT::i32;
} else {
ZeroReg = AArch64::XZR;
ZeroVT = MVT::i64;
}
SDValue SplatVal =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
/// vector store. Even if the stores are not merged it is four stores vs a dup,
/// followed by an ext.b and two stores.
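/// E.g. a v4i32 splat of w1 stored to [x0] is expected to end up as
/// something like "stp w1, w1, [x0]; stp w1, w1, [x0, #8]" once the load
/// store optimizer has merged the scalar stores.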
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
// Don't replace floating point stores; they possibly won't be transformed to
// stp because of the store pair suppress pass.
if (VT.isFloatingPoint())
return SDValue();
// We can express a splat as store pair(s) for 2 or 4 elements.
unsigned NumVecElts = VT.getVectorNumElements();
if (NumVecElts != 4 && NumVecElts != 2)
return SDValue();
// If the store is truncating then it's going down to i16 or smaller, which
// means it can be implemented in a single store anyway.
if (St.isTruncatingStore())
return SDValue();
// Check that this is a splat.
// Make sure that each of the relevant vector element locations are inserted
// to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
SDValue SplatVal;
for (unsigned I = 0; I < NumVecElts; ++I) {
// Check for insert vector elements.
if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
return SDValue();
// Check that the same value is inserted at each vector element.
if (I == 0)
SplatVal = StVal.getOperand(1);
else if (StVal.getOperand(1) != SplatVal)
return SDValue();
// Check insert element index.
ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
if (!CIndex)
return SDValue();
uint64_t IndexVal = CIndex->getZExtValue();
if (IndexVal >= NumVecElts)
return SDValue();
IndexNotInserted.reset(IndexVal);
StVal = StVal.getOperand(0);
}
// Check that all vector element locations were inserted to.
if (IndexNotInserted.any())
return SDValue();
return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
StoreSDNode *S = cast<StoreSDNode>(N);
if (S->isVolatile() || S->isIndexed())
return SDValue();
SDValue StVal = S->getValue();
EVT VT = StVal.getValueType();
if (!VT.isVector())
return SDValue();
// If we get a splat of zeros, convert this vector store to a store of
// scalars. They will be merged into store pairs of xzr thereby removing one
// instruction and one register.
if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
return ReplacedZeroSplat;
// FIXME: The logic for deciding if an unaligned store should be split should
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();
// Don't split at -Oz.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
// Don't split v2i64 vectors. Memcpy lowering produces those and splitting
// those up regresses performance on micro-benchmarks and olden/bh.
if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
return SDValue();
// Split unaligned 16B stores. They are terrible for performance.
// Don't split stores with alignment of 1 or 2. Code that uses clang vector
// extensions can use this to mark that it does not want splitting to happen
// (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
// eliminating alignment hazards is only 1 in 8 for alignment of 2.
if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
S->getAlignment() <= 2)
return SDValue();
// If we get a splat of a scalar convert this vector store to a store of
// scalars. They will be merged into store pairs thereby removing two
// instructions.
if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
return ReplacedSplat;
SDLoc DL(S);
unsigned NumElts = VT.getVectorNumElements() / 2;
// Split VT into two.
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
DAG.getConstant(0, DL, MVT::i64));
SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
DAG.getConstant(NumElts, DL, MVT::i64));
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
S->getAlignment(), S->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(8, DL, MVT::i64));
return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
S->getPointerInfo(), S->getAlignment(),
S->getMemOperand()->getFlags());
}
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
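/// I.e. when a scalar load that feeds an insert_vector_elt (or a DUP) has
/// its address incremented by the element size, fold the increment into a
/// post-indexed LD1 lane / LD1R, e.g. something like
/// "ld1 { v0.s }[1], [x0], #4".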
static SDValue performPostLD1Combine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
bool IsLaneOp) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
unsigned LoadIdx = IsLaneOp ? 1 : 0;
SDNode *LD = N->getOperand(LoadIdx).getNode();
// If it is not a LOAD, we cannot do this combine.
if (LD->getOpcode() != ISD::LOAD)
return SDValue();
// The vector lane must be a constant in the LD1LANE opcode.
SDValue Lane;
if (IsLaneOp) {
Lane = N->getOperand(2);
auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
}
LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
EVT MemVT = LoadSDN->getMemoryVT();
// Check if memory operand is the same type as the vector element.
if (MemVT != VT.getVectorElementType())
return SDValue();
// Check if there are other uses. If so, do not combine as it will introduce
// an extra load.
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
++UI) {
if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
continue;
if (*UI != N)
return SDValue();
}
SDValue Addr = LD->getOperand(1);
SDValue Vector = N->getOperand(0);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD
|| UI.getUse().getResNo() != Addr.getResNo())
continue;
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
unsigned NumBytes = VT.getScalarSizeInBits() / 8;
if (IncVal != NumBytes)
continue;
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
}
// To avoid cycle construction make sure that neither the load nor the add
// are predecessors to each other or the Vector.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
- Visited.insert(N);
+ Visited.insert(Addr.getNode());
Worklist.push_back(User);
Worklist.push_back(LD);
Worklist.push_back(Vector.getNode());
if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
continue;
SmallVector<SDValue, 8> Ops;
Ops.push_back(LD->getOperand(0)); // Chain
if (IsLaneOp) {
Ops.push_back(Vector); // The vector to be inserted
Ops.push_back(Lane); // The lane to be inserted in the vector
}
Ops.push_back(Addr);
Ops.push_back(Inc);
EVT Tys[3] = { VT, MVT::i64, MVT::Other };
SDVTList SDTys = DAG.getVTList(Tys);
unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
MemVT,
LoadSDN->getMemOperand());
// Update the uses.
SDValue NewResults[] = {
SDValue(LD, 0), // The result of load
SDValue(UpdN.getNode(), 2) // Chain
};
DCI.CombineTo(LD, NewResults);
DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
break;
}
return SDValue();
}
/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation.
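/// Only bits [55:0] of the address are demanded, so e.g. an
/// (and x, 0x00ffffffffffffff) that is used purely as an address can be
/// simplified back to x.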
static bool performTBISimplification(SDValue Addr,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
APInt DemandedMask = APInt::getLowBitsSet(64, 56);
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
return true;
}
return false;
}
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
return Split;
if (Subtarget->supportsAddressTopByteIgnored() &&
performTBISimplification(N->getOperand(2), DCI, DAG))
return SDValue(N, 0);
return SDValue();
}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
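/// E.g. an aarch64_neon_ld2 of two v4i32 vectors whose base pointer is
/// also incremented by 32 bytes becomes the post-indexed form, roughly
/// "ld2 { v0.4s, v1.4s }, [x0], #32".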
static SDValue performNEONPostLDSTCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
unsigned AddrOpIdx = N->getNumOperands() - 1;
SDValue Addr = N->getOperand(AddrOpIdx);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD ||
UI.getUse().getResNo() != Addr.getResNo())
continue;
// Check that the add is independent of the load/store. Otherwise, folding
// it would create a cycle.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Visited.insert(Addr.getNode());
Worklist.push_back(N);
Worklist.push_back(User);
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
continue;
// Find the new opcode for the updating load/store.
bool IsStore = false;
bool IsLaneOp = false;
bool IsDupOp = false;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default: llvm_unreachable("unexpected intrinsic for Neon base update");
case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
NumVecs = 2; break;
case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
NumVecs = 3; break;
case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
NumVecs = 4; break;
case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
NumVecs = 2; IsStore = true; break;
case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
NumVecs = 3; IsStore = true; break;
case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
NumVecs = 4; IsStore = true; break;
case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
NumVecs = 2; break;
case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
NumVecs = 3; break;
case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
NumVecs = 4; break;
case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
NumVecs = 2; IsStore = true; break;
case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
NumVecs = 3; IsStore = true; break;
case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
NumVecs = 4; IsStore = true; break;
case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
NumVecs = 2; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
NumVecs = 3; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
NumVecs = 4; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
NumVecs = 2; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
NumVecs = 3; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
NumVecs = 4; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
NumVecs = 2; IsStore = true; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
NumVecs = 3; IsStore = true; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
NumVecs = 4; IsStore = true; IsLaneOp = true; break;
}
EVT VecTy;
if (IsStore)
VecTy = N->getOperand(2).getValueType();
else
VecTy = N->getValueType(0);
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
if (IsLaneOp || IsDupOp)
NumBytes /= VecTy.getVectorNumElements();
if (IncVal != NumBytes)
continue;
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
}
SmallVector<SDValue, 8> Ops;
Ops.push_back(N->getOperand(0)); // Incoming chain
// Load lane and store have vector list as input.
if (IsLaneOp || IsStore)
for (unsigned i = 2; i < AddrOpIdx; ++i)
Ops.push_back(N->getOperand(i));
Ops.push_back(Addr); // Base register
Ops.push_back(Inc);
// Return Types.
EVT Tys[6];
unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = VecTy;
Tys[n++] = MVT::i64; // Type of write back register
Tys[n] = MVT::Other; // Type of the chain
SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
MemInt->getMemoryVT(),
MemInt->getMemOperand());
// Update the uses.
std::vector<SDValue> NewResults;
for (unsigned i = 0; i < NumResultVecs; ++i) {
NewResults.push_back(SDValue(UpdN.getNode(), i));
}
NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
DCI.CombineTo(N, NewResults);
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
break;
}
return SDValue();
}
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
static
bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
ExtType = ISD::NON_EXTLOAD;
switch(V.getNode()->getOpcode()) {
default:
return false;
case ISD::LOAD: {
LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
|| (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
ExtType = LoadNode->getExtensionType();
return true;
}
return false;
}
case ISD::AssertSext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
if ((TypeNode->getVT() == MVT::i8 && width == 8)
|| (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::SEXTLOAD;
return true;
}
return false;
}
case ISD::AssertZext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
if ((TypeNode->getVT() == MVT::i8 && width == 8)
|| (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::ZEXTLOAD;
return true;
}
return false;
}
case ISD::Constant:
case ISD::TargetConstant: {
return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
1LL << (width - 1);
}
}
return true;
}
// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
//
// +-------------+ +-------------+ +-------------+ +-------------+
// | Input | | AddConstant | | CompConstant| | CC |
// +-------------+ +-------------+ +-------------+ +-------------+
// | | | |
// V V | +----------+
// +-------------+ +----+ | |
// | ADD | |0xff| | |
// +-------------+ +----+ | |
// | | | |
// V V | |
// +-------------+ | |
// | AND | | |
// +-------------+ | |
// | | |
// +-----+ | |
// | | |
// V V V
// +-------------+
// | CMP |
// +-------------+
//
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any input width; the above graph is
// specific to 8 bits).
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct set of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns for the current extension type of Input (w0).
//
// sub w8, w0, w1
// and w10, w8, #0x0f
// cmp w8, w2
// cset w9, AArch64CC
// cmp w10, w2
// cset w11, AArch64CC
// cmp w9, w11
// cset w0, eq
// ret
//
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave outputs equivalent to the
// function above for all inputs, so they can be used to determine whether the
// removal is legal instead.
//
// isEquivalentMaskless() is the code for testing whether the AND can be
// removed, factored out of the DAG recognition because the DAG can take
// several forms.
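// As a sanity check on the equations: with a zero-extended input and
// AddConstant == 0 the add cannot carry out of the masked width, so the AND is
// a no-op and the masked and unmasked compares agree for every condition code;
// the non-trivial cases below are the ones where the add can wrap.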
static bool isEquivalentMaskless(unsigned CC, unsigned width,
ISD::LoadExtType ExtType, int AddConstant,
int CompConstant) {
// By being careful about our equations and only writing them in terms of
// symbolic values and well known constants (0, 1, -1, MaxUInt) we can
// make them generally applicable to all bit widths.
int MaxUInt = (1 << width);
// For the purposes of these comparisons sign extending the type is
// equivalent to zero extending the add and displacing it by half the integer
// width. Provided we are careful and make sure our equations are valid over
// the whole range we can just adjust the input and avoid writing equations
// for sign extended inputs.
if (ExtType == ISD::SEXTLOAD)
AddConstant -= (1 << (width-1));
switch(CC) {
case AArch64CC::LE:
case AArch64CC::GT:
if ((AddConstant == 0) ||
(CompConstant == MaxUInt - 1 && AddConstant < 0) ||
(AddConstant >= 0 && CompConstant < 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
return true;
break;
case AArch64CC::LT:
case AArch64CC::GE:
if ((AddConstant == 0) ||
(AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
return true;
break;
case AArch64CC::HI:
case AArch64CC::LS:
if ((AddConstant >= 0 && CompConstant < 0) ||
(AddConstant <= 0 && CompConstant >= -1 &&
CompConstant < AddConstant + MaxUInt))
return true;
break;
case AArch64CC::PL:
case AArch64CC::MI:
if ((AddConstant == 0) ||
(AddConstant > 0 && CompConstant <= 0) ||
(AddConstant < 0 && CompConstant <= AddConstant))
return true;
break;
case AArch64CC::LO:
case AArch64CC::HS:
if ((AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant >= 0 &&
CompConstant <= AddConstant + MaxUInt))
return true;
break;
case AArch64CC::EQ:
case AArch64CC::NE:
if ((AddConstant > 0 && CompConstant < 0) ||
(AddConstant < 0 && CompConstant >= 0 &&
CompConstant < AddConstant + MaxUInt) ||
(AddConstant >= 0 && CompConstant >= 0 &&
CompConstant >= AddConstant) ||
(AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
return true;
break;
case AArch64CC::VS:
case AArch64CC::VC:
case AArch64CC::AL:
case AArch64CC::NV:
return true;
case AArch64CC::Invalid:
break;
}
return false;
}
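// performCONDCombine looks through a flag-setting SUBS whose left operand is
// (and (add x, C1), 0xff or 0xffff) and whose right operand is a constant. If
// isEquivalentMaskless() proves the compare gives the same answer with and
// without the mask, the SUBS is rebuilt on the un-masked add so the AND dies.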
static
SDValue performCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG, unsigned CCIndex,
unsigned CmpIndex) {
unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
unsigned CondOpcode = SubsNode->getOpcode();
if (CondOpcode != AArch64ISD::SUBS)
return SDValue();
// There is a SUBS feeding this condition. Is it fed by a mask we can
// use?
SDNode *AndNode = SubsNode->getOperand(0).getNode();
unsigned MaskBits = 0;
if (AndNode->getOpcode() != ISD::AND)
return SDValue();
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
uint32_t CNV = CN->getZExtValue();
if (CNV == 255)
MaskBits = 8;
else if (CNV == 65535)
MaskBits = 16;
}
if (!MaskBits)
return SDValue();
SDValue AddValue = AndNode->getOperand(0);
if (AddValue.getOpcode() != ISD::ADD)
return SDValue();
// The basic dag structure is correct, grab the inputs and validate them.
SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
SDValue SubsInputValue = SubsNode->getOperand(1);
// The mask is present and the provenance of all the values is a smaller type,
// let's see if the mask is superfluous.
if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
!isa<ConstantSDNode>(SubsInputValue.getNode()))
return SDValue();
ISD::LoadExtType ExtType;
if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue1, MaskBits, ExtType) )
return SDValue();
if(!isEquivalentMaskless(CC, MaskBits, ExtType,
cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
return SDValue();
// The AND is not necessary, remove it.
SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
SubsNode->getValueType(1));
SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
return SDValue(N, 0);
}
// Optimize compare with zero and branch.
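// For example, (brcond NE, (SUBS x, 0)) where the SUBS value is otherwise
// unused becomes (CBNZ x, dest), so no separate flag-setting compare is
// emitted.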
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
// will not be produced, as they are conditional branch instructions that do
// not set flags.
if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
return SDValue();
if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
N = NV.getNode();
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
SDValue CCVal = N->getOperand(2);
SDValue Cmp = N->getOperand(3);
assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
return SDValue();
unsigned CmpOpc = Cmp.getOpcode();
if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
return SDValue();
// Only attempt folding if there is only one use of the flag and no use of the
// value.
if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
return SDValue();
SDValue LHS = Cmp.getOperand(0);
SDValue RHS = Cmp.getOperand(1);
assert(LHS.getValueType() == RHS.getValueType() &&
"Expected the value type to be the same for both operands!");
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return SDValue();
if (isNullConstant(LHS))
std::swap(LHS, RHS);
if (!isNullConstant(RHS))
return SDValue();
if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
LHS.getOpcode() == ISD::SRL)
return SDValue();
// Fold the compare into the branch instruction.
SDValue BR;
if (CC == AArch64CC::EQ)
BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
else
BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, BR, false);
return SDValue();
}
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
// AArch64ISD::TBZ is matched during legalization.
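// For example, testing bit 4 of (and (srl x, 2), 0x10) becomes a test of bit 6
// of x: the AND is dropped because it preserves the tested bit, and the SRL
// raises the bit index by the shift amount.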
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
SelectionDAG &DAG) {
if (!Op->hasOneUse())
return Op;
// We don't handle undef/constant-fold cases below, as they should have
// already been taken care of (e.g. and of 0, test of undefined shifted bits,
// etc.)
// (tbz (trunc x), b) -> (tbz x, b)
// This case is just here to enable more of the below cases to be caught.
if (Op->getOpcode() == ISD::TRUNCATE &&
Bit < Op->getValueType(0).getSizeInBits()) {
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
// (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
if (Op->getOpcode() == ISD::ANY_EXTEND &&
Bit < Op->getOperand(0).getValueSizeInBits()) {
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
if (Op->getNumOperands() != 2)
return Op;
auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!C)
return Op;
switch (Op->getOpcode()) {
default:
return Op;
// (tbz (and x, m), b) -> (tbz x, b)
case ISD::AND:
if ((C->getZExtValue() >> Bit) & 1)
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
return Op;
// (tbz (shl x, c), b) -> (tbz x, b-c)
case ISD::SHL:
if (C->getZExtValue() <= Bit &&
(Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
Bit = Bit - C->getZExtValue();
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
return Op;
// (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
case ISD::SRA:
Bit = Bit + C->getZExtValue();
if (Bit >= Op->getValueType(0).getSizeInBits())
Bit = Op->getValueType(0).getSizeInBits() - 1;
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
// (tbz (srl x, c), b) -> (tbz x, b+c)
case ISD::SRL:
if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
Bit = Bit + C->getZExtValue();
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
return Op;
// (tbz (xor x, -1), b) -> (tbnz x, b)
case ISD::XOR:
if ((C->getZExtValue() >> Bit) & 1)
Invert = !Invert;
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
}
// Optimize test single bit zero/non-zero and branch.
static SDValue performTBZCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
bool Invert = false;
SDValue TestSrc = N->getOperand(1);
SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
if (TestSrc == NewTestSrc)
return SDValue();
unsigned NewOpc = N->getOpcode();
if (Invert) {
if (NewOpc == AArch64ISD::TBZ)
NewOpc = AArch64ISD::TBNZ;
else {
assert(NewOpc == AArch64ISD::TBNZ);
NewOpc = AArch64ISD::TBZ;
}
}
SDLoc DL(N);
return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}
// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
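// For example, (vselect (v1i1 (setcc v1f64 a, b, cc)), x, y) is rewritten to
// use a v1i64 setcc, so the condition has the same width as the selected
// operands.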
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
CCVT.getVectorElementType() != MVT::i1)
return SDValue();
EVT ResVT = N->getValueType(0);
EVT CmpVT = N0.getOperand(0).getValueType();
// Only combine when the result type is of the same size as the compared
// operands.
if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
return SDValue();
SDValue IfTrue = N->getOperand(1);
SDValue IfFalse = N->getOperand(2);
SDValue SetCC =
DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
N0.getOperand(0), N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
IfTrue, IfFalse);
}
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
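/// For example, (select (setcc f64 a, b), v2f64 x, v2f64 y) becomes a v2f64
/// compare of (scalar_to_vector a) with (scalar_to_vector b); lane 0 of the
/// resulting v2i64 mask is duplicated across all lanes and used in a VSELECT
/// of x and y.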
static SDValue performSelectCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0);
EVT ResVT = N->getValueType(0);
if (N0.getOpcode() != ISD::SETCC)
return SDValue();
// Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
// scalar SetCCResultType. We also don't expect vectors, because we assume
// that selects fed by vector SETCCs are canonicalized to VSELECT.
assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
"Scalar-SETCC feeding SELECT has unexpected result type!");
// If NumMaskElts == 0, the comparison is larger than select result. The
// largest real NEON comparison is 64-bits per lane, which means the result is
// at most 32-bits and an illegal vector. Just bail out for now.
EVT SrcVT = N0.getOperand(0).getValueType();
// Don't try to do this optimization when the setcc itself has i1 operands.
// There are no legal vectors of i1, so this would be pointless.
if (SrcVT == MVT::i1)
return SDValue();
int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
// Also bail out if the vector CCVT isn't the same size as ResVT.
// This can happen if the SETCC operand size doesn't divide the ResVT size
// (e.g., f64 vs v3f32).
if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
return SDValue();
// Make sure we didn't create illegal types, if we're not supposed to.
assert(DCI.isBeforeLegalize() ||
DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
SDLoc DL(N0);
SDValue LHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
SDValue RHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
Mask = DAG.getNode(ISD::BITCAST, DL,
ResVT.changeVectorElementTypeToInteger(), Mask);
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
if (N->getValueType(0) == N->getOperand(0).getValueType())
return N->getOperand(0);
return SDValue();
}
// If all users of the globaladdr are of the form (globaladdr + constant), find
// the smallest constant, fold it into the globaladdr's offset and rewrite the
// globaladdr as (globaladdr + constant) - constant.
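// For example, if the only uses are (add g, 8) and (add g, 12), the node is
// rewritten as (sub (g + 8), 8); the adds then fold to (g + 8) and
// (add (g + 8), 4), so the common offset can typically be materialised as part
// of the ADRP/ADD address computation instead of a separate addition.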
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget,
const TargetMachine &TM) {
auto *GN = cast<GlobalAddressSDNode>(N);
if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
AArch64II::MO_NO_FLAG)
return SDValue();
uint64_t MinOffset = -1ull;
for (SDNode *N : GN->uses()) {
if (N->getOpcode() != ISD::ADD)
return SDValue();
auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
if (!C)
C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
MinOffset = std::min(MinOffset, C->getZExtValue());
}
uint64_t Offset = MinOffset + GN->getOffset();
// Require that the new offset is larger than the existing one. Otherwise, we
// can end up oscillating between two possible DAGs, for example,
// (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
if (Offset <= uint64_t(GN->getOffset()))
return SDValue();
// Check whether folding this offset is legal. It must not go out of bounds of
// the referenced object to avoid violating the code model, and must be
// smaller than 2^21 because this is the largest offset expressible in all
// object formats.
//
// This check also prevents us from folding negative offsets, which will end
// up being treated in the same way as large positive ones. They could also
// cause code model violations, and aren't really common enough to matter.
if (Offset >= (1 << 21))
return SDValue();
const GlobalValue *GV = GN->getGlobal();
Type *T = GV->getValueType();
if (!T->isSized() ||
Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
return SDValue();
SDLoc DL(GN);
SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
DAG.getConstant(MinOffset, DL, MVT::i64));
}
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
case ISD::ADD:
case ISD::SUB:
return performAddSubLongCombine(N, DCI, DAG);
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
return performMulCombine(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performIntToFpCombine(N, DAG, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return performFpToIntCombine(N, DAG, DCI, Subtarget);
case ISD::FDIV:
return performFDivCombine(N, DAG, DCI, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::AND:
return performANDCombine(N, DCI);
case ISD::SRL:
return performSRLCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performIntrinsicCombine(N, DCI, Subtarget);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
return performExtendCombine(N, DCI, DAG);
case ISD::BITCAST:
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
case ISD::LOAD:
if (performTBISimplification(N->getOperand(1), DCI, DAG))
return SDValue(N, 0);
break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:
case AArch64ISD::TBZ:
return performTBZCombine(N, DCI, DAG);
case AArch64ISD::CSEL:
return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
case Intrinsic::aarch64_neon_ld1x4:
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
case Intrinsic::aarch64_neon_ld2r:
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r:
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane:
return performNEONPostLDSTCombine(N, DCI, DAG);
default:
break;
}
break;
case ISD::GlobalAddress:
return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
}
return SDValue();
}
// Check if the return value is used as only a return value, as otherwise
// we can't perform a tail-call. In particular, we need to check for
// target ISD nodes that are returns and any other "odd" constructs
// that the generic analysis code won't necessarily catch.
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
SDValue &Chain) const {
if (N->getNumValues() != 1)
return false;
if (!N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
MVT::Glue)
return false;
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
bool HasRet = false;
for (SDNode *Node : Copy->uses()) {
if (Node->getOpcode() != AArch64ISD::RET_FLAG)
return false;
HasRet = true;
}
if (!HasRet)
return false;
Chain = TCChain;
return true;
}
// Return whether an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
}
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
bool &IsInc,
SelectionDAG &DAG) const {
if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
return false;
Base = Op->getOperand(0);
// All of the indexed addressing mode instructions take a signed
// 9 bit immediate offset.
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
int64_t RHSC = RHS->getSExtValue();
if (Op->getOpcode() == ISD::SUB)
RHSC = -(uint64_t)RHSC;
if (!isInt<9>(RHSC))
return false;
IsInc = (Op->getOpcode() == ISD::ADD);
Offset = Op->getOperand(1);
return true;
}
return false;
}
bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
} else
return false;
bool IsInc;
if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
return false;
AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
return true;
}
bool AArch64TargetLowering::getPostIndexedAddressParts(
SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
} else
return false;
bool IsInc;
if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
return false;
// Post-indexing updates the base, so it's not a valid transform
// if that's not the same as the load's pointer.
if (Ptr != Base)
return false;
AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
return true;
}
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Op = N->getOperand(0);
if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
return;
Op = SDValue(
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
DAG.getUNDEF(MVT::i32), Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}
static void ReplaceReductionResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, unsigned InterOp,
unsigned AcrossOp) {
EVT LoVT, HiVT;
SDValue Lo, Hi;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
Results.push_back(SplitVal);
}
static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
DAG.getNode(ISD::SRL, DL, MVT::i128, N,
DAG.getConstant(64, DL, MVT::i64)));
return std::make_pair(Lo, Hi);
}
// Create an even/odd pair of X registers holding integer value V.
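// The CASP family of instructions operates on a sequential (even/odd) register
// pair, so the i128 value is split into two i64 halves (swapped on big-endian)
// and tied together with a REG_SEQUENCE in the XSeqPairs register class.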
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDLoc dl(V.getNode());
SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
SDValue VHi = DAG.getAnyExtOrTrunc(
DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
dl, MVT::i64);
if (DAG.getDataLayout().isBigEndian())
std::swap (VLo, VHi);
SDValue RegClass =
DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
return SDValue(
DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}
static void ReplaceCMP_SWAP_128Results(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
if (Subtarget->hasLSE()) {
// LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
// so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
SDValue Ops[] = {
createGPRPairNode(DAG, N->getOperand(2)), // Compare value
createGPRPairNode(DAG, N->getOperand(3)), // Store value
N->getOperand(1), // Ptr
N->getOperand(0), // Chain in
};
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
unsigned Opcode;
switch (MemOp->getOrdering()) {
case AtomicOrdering::Monotonic:
Opcode = AArch64::CASPX;
break;
case AtomicOrdering::Acquire:
Opcode = AArch64::CASPAX;
break;
case AtomicOrdering::Release:
Opcode = AArch64::CASPLX;
break;
case AtomicOrdering::AcquireRelease:
case AtomicOrdering::SequentiallyConsistent:
Opcode = AArch64::CASPALX;
break;
default:
llvm_unreachable("Unexpected ordering!");
}
MachineSDNode *CmpSwap = DAG.getMachineNode(
Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
DAG.setNodeMemRefs(CmpSwap, {MemOp});
unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
if (DAG.getDataLayout().isBigEndian())
std::swap(SubReg1, SubReg2);
Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
SDValue(CmpSwap, 0)));
Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
SDValue(CmpSwap, 0)));
Results.push_back(SDValue(CmpSwap, 1)); // Chain out
return;
}
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
New.first, New.second, N->getOperand(0)};
SDNode *CmpSwap = DAG.getMachineNode(
AArch64::CMP_SWAP_128, SDLoc(N),
DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
Results.push_back(SDValue(CmpSwap, 0));
Results.push_back(SDValue(CmpSwap, 1));
Results.push_back(SDValue(CmpSwap, 3));
}
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
default:
llvm_unreachable("Don't know how to custom expand this");
case ISD::BITCAST:
ReplaceBITCASTResults(N, Results, DAG);
return;
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
return;
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
case AArch64ISD::UADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
return;
case AArch64ISD::SMINV:
ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
return;
case AArch64ISD::UMINV:
ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
return;
case AArch64ISD::SMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
return;
case AArch64ISD::UMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
return;
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT:
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.
return;
case ISD::ATOMIC_CMP_SWAP:
ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
return;
}
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
return TargetLowering::useLoadStackGuardNode();
return true;
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are three or more FDIVs.
return 3;
}
TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
// During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
// v4i16, v2i32 instead of promoting them.
if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
VT == MVT::v1f32)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
return Size == 128;
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits.
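// For example, with LSE an atomicrmw add on i32/i64 is left intact and can be
// selected to a single LDADD-class instruction; without LSE (and for Nand or
// 128-bit operations) it is expanded to an ldxr/stxr loop in IR.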
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
if (Size > 128) return AtomicExpansionKind::None;
// Nand not supported in LSE.
if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
// Leave 128 bits to LLSC.
return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
}
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
// If subtarget has LSE, leave cmpxchg intact for codegen.
if (Subtarget->hasLSE())
return AtomicExpansionKind::None;
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
// on the stack and close enough to the spill slot, this can lead to a
// situation where the monitor always gets cleared and the atomic operation
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
if (getTargetMachine().getOptLevel() == 0)
return AtomicExpansionKind::None;
return AtomicExpansionKind::LLSC;
}
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
bool IsAcquire = isAcquireOrStronger(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
// intrinsics must return {i64, i64} and we have to recombine them into a
// single i128 here.
if (ValTy->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
Function *Ldxr = Intrinsic::getDeclaration(M, Int);
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
return Builder.CreateOr(
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}
Type *Tys[] = { Addr->getType() };
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
const DataLayout &DL = M->getDataLayout();
IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
return Builder.CreateBitCast(Trunc, EltTy);
}
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
IRBuilder<> &Builder) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
}
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal types, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
// before the call.
if (Val->getType()->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
Function *Stxr = Intrinsic::getDeclaration(M, Int);
Type *Int64Ty = Type::getInt64Ty(M->getContext());
Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
}
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
Type *Tys[] = { Addr->getType() };
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
const DataLayout &DL = M->getDataLayout();
IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
Val = Builder.CreateBitCast(Val, IntValTy);
return Builder.CreateCall(Stxr,
{Builder.CreateZExtOrBitCast(
Val, Stxr->getFunctionType()->getParamType(0)),
Addr});
}
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
return Ty->isArrayTy();
}
bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
EVT) const {
return false;
}
static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
Offset),
IRB.getInt8PtrTy()->getPointerTo(0));
}
Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the stack cookie. See the definition
// of TLS_SLOT_STACK_GUARD in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
if (Subtarget->isTargetAndroid())
return UseTlsOffset(IRB, 0x28);
// Fuchsia is similar.
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
if (Subtarget->isTargetFuchsia())
return UseTlsOffset(IRB, -0x10);
return TargetLowering::getIRStackGuard(IRB);
}
void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
"__security_check_cookie", Type::getVoidTy(M.getContext()),
Type::getInt8PtrTy(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::Win64);
F->addAttribute(1, Attribute::AttrKind::InReg);
}
return;
}
TargetLowering::insertSSPDeclarations(M);
}
Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
return M.getGlobalVariable("__security_cookie");
return TargetLowering::getSDagStackGuard(M);
}
Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
return M.getFunction("__security_check_cookie");
return TargetLowering::getSSPStackGuardCheck(M);
}
Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
if (Subtarget->isTargetAndroid())
return UseTlsOffset(IRB, 0x48);
// Fuchsia is similar.
// <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
if (Subtarget->isTargetFuchsia())
return UseTlsOffset(IRB, -0x8);
return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
// Only sink 'and' mask to cmp use block if it is masking a single bit, since
// this likely allows folding the and/cmp/br into a single tbz instruction. It
// may be beneficial to sink in other cases, but we would have to check that
// the cmp would not get folded into the br to form a cbz for these to be
// beneficial.
ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
if (!Mask)
return false;
return Mask->getValue().isPowerOf2();
+}
+
+bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
+ SDNode *N) const {
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ !Subtarget->isTargetWindows())
+ return false;
+ return true;
}
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in AArch64FunctionInfo.
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
AFI->setIsSplitCSR(true);
}
void AArch64TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (AArch64::GPR64RegClass.contains(*I))
RC = &AArch64::GPR64RegClass;
else if (AArch64::FPR64RegClass.contains(*I))
RC = &AArch64::FPR64RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
unsigned NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
assert(Entry->getParent()->getFunction().hasFnAttribute(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on AArch64 is expensive. However, when aggressively
// optimizing for code size, we prefer to use a div instruction, as it is
// usually smaller than the alternative sequence.
// The exception to this is vector division. Since AArch64 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize =
Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
// We want inc-of-add for scalars and sub-of-not for vectors.
return VT.isScalarInteger();
}
bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}
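// Darwin and Windows use a plain pointer for va_list; AAPCS64 defines it as a
// struct of three pointers (__stack, __gr_top, __vr_top) and two 32-bit
// offsets (__gr_offs, __vr_offs), hence 3 pointers + 2 * 32 bits below.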
unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
return getPointerTy(DL).getSizeInBits();
return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
}
void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
MF.getFrameInfo().computeMaxCallFrameSize(MF);
TargetLoweringBase::finalizeLowering(MF);
}
// Unlike X86, we let frame lowering assign offsets to all catch objects.
bool AArch64TargetLowering::needsFixedCatchObjects() const {
return false;
}
Index: projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h (revision 351722)
@@ -1,748 +1,744 @@
//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that AArch64 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
#include "AArch64.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Instruction.h"
namespace llvm {
namespace AArch64ISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
CALL, // Function call.
// Produces the full sequence of instructions for getting the thread pointer
// offset of a variable into X0, using the TLSDesc model.
TLSDESC_CALLSEQ,
ADRP, // Page address of a TargetGlobalAddress operand.
ADR, // ADR
ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand.
LOADgot, // Load from automatically generated descriptor (e.g. Global
// Offset Table, TLS record).
RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
BRCOND, // Conditional branch instruction; "b.cond".
CSEL,
FCSEL, // Conditional move instruction.
CSINV, // Conditional select invert.
CSNEG, // Conditional select negate.
CSINC, // Conditional select increment.
// Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
// ELF.
THREAD_POINTER,
ADC,
SBC, // adc, sbc instructions
// Arithmetic instructions which write flags.
ADDS,
SUBS,
ADCS,
SBCS,
ANDS,
// Conditional compares. Operands: left,right,falsecc,cc,flags
CCMP,
CCMN,
FCCMP,
// Floating point comparison
FCMP,
// Scalar extract
EXTR,
// Scalar-to-vector duplication
DUP,
DUPLANE8,
DUPLANE16,
DUPLANE32,
DUPLANE64,
// Vector immediate moves
MOVI,
MOVIshift,
MOVIedit,
MOVImsl,
FMOV,
MVNIshift,
MVNImsl,
// Vector immediate ops
BICi,
ORRi,
// Vector bit select: similar to ISD::VSELECT but not all bits within an
// element must be identical.
BSL,
// Vector arithmetic negation
NEG,
// Vector shuffles
ZIP1,
ZIP2,
UZP1,
UZP2,
TRN1,
TRN2,
REV16,
REV32,
REV64,
EXT,
// Vector shift by scalar
VSHL,
VLSHR,
VASHR,
// Vector shift by scalar (again)
SQSHL_I,
UQSHL_I,
SQSHLU_I,
SRSHR_I,
URSHR_I,
// Vector comparisons
CMEQ,
CMGE,
CMGT,
CMHI,
CMHS,
FCMEQ,
FCMGE,
FCMGT,
// Vector zero comparisons
CMEQz,
CMGEz,
CMGTz,
CMLEz,
CMLTz,
FCMEQz,
FCMGEz,
FCMGTz,
FCMLEz,
FCMLTz,
// Vector across-lanes addition
// Only the lower result lane is defined.
SADDV,
UADDV,
// Vector across-lanes min/max
// Only the lower result lane is defined.
SMINV,
UMINV,
SMAXV,
UMAXV,
// Vector bitwise negation
NOT,
// Vector bitwise selection
BIT,
// Compare-and-branch
CBZ,
CBNZ,
TBZ,
TBNZ,
// Tail calls
TC_RETURN,
// Custom prefetch handling
PREFETCH,
// {s|u}int to FP within a FP register.
SITOF,
UITOF,
/// Natural vector cast. ISD::BITCAST is not natural in the big-endian
/// world w.r.t. vectors, which causes additional REV instructions to be
/// generated to compensate for the byte-swapping. But sometimes we do
/// need to re-interpret the data in SIMD vector registers in big-endian
/// mode without emitting such REV instructions.
NVCAST,
SMULL,
UMULL,
// Reciprocal estimates and steps.
FRECPE, FRECPS,
FRSQRTE, FRSQRTS,
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
LD4post,
ST2post,
ST3post,
ST4post,
LD1x2post,
LD1x3post,
LD1x4post,
ST1x2post,
ST1x3post,
ST1x4post,
LD1DUPpost,
LD2DUPpost,
LD3DUPpost,
LD4DUPpost,
LD1LANEpost,
LD2LANEpost,
LD3LANEpost,
LD4LANEpost,
ST2LANEpost,
ST3LANEpost,
ST4LANEpost,
STG,
STZG,
ST2G,
STZ2G
};
} // end namespace AArch64ISD
namespace {
// Any instruction that defines a 32-bit result zeros out the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
// be copying from a truncate. But any other 32-bit operation will zero-extend
// up to 64 bits.
// FIXME: X86 also checks for CMOV here. Do we need something similar?
static inline bool isDef32(const SDNode &N) {
unsigned Opc = N.getOpcode();
return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
Opc != ISD::CopyFromReg;
}
} // end anonymous namespace
class AArch64Subtarget;
class AArch64TargetMachine;
class AArch64TargetLowering : public TargetLowering {
public:
explicit AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI);
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
TargetLoweringOpt &TLO) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
/// Returns true if the target allows unaligned memory accesses of the
/// specified type.
bool allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace = 0, unsigned Align = 1,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
/// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
const char *getTargetNodeName(unsigned Opcode) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
/// Returns true if a cast between SrcAS and DestAS is a noop.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
// Addrspacecasts are always noops.
return true;
}
/// This method returns a target specific FastISel object, or null if the
/// target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
/// Return true if the given shuffle mask can be codegen'd directly, or if it
/// should be stack expanded.
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
/// Return the ISD::SETCC ValueType.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
EVT NewVT) const override;
bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;
bool isProfitableToHoist(Instruction *I) const override;
bool isZExtFree(Type *Ty1, Type *Ty2) const override;
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
bool lowerInterleavedLoad(LoadInst *LI,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool isLegalAddImmediate(int64_t) const override;
bool isLegalICmpImmediate(int64_t) const override;
bool shouldConsiderGEPOffsetSplit() const override;
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const override;
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I = nullptr) const override;
/// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this method
/// returns true, otherwise fmuladd is expanded to fmul + fadd.
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
/// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr, AtomicOrdering Ord) const override;
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
bool useLoadStackGuardNode() const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(MVT VT) const override;
/// If the target has a standard location for the stack protector cookie,
/// returns the address of that location. Otherwise, returns nullptr.
Value *getIRStackGuard(IRBuilder<> &IRB) const override;
void insertSSPDeclarations(Module &M) const override;
Value *getSDagStackGuard(const Module &M) const override;
Function *getSSPStackGuardCheck(const Module &M) const override;
/// If the target has a standard location for the unsafe stack pointer,
/// returns the address of that location. Otherwise, returns nullptr.
Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
unsigned
getExceptionPointerRegister(const Constant *PersonalityFn) const override {
// FIXME: This is a guess. Has this been defined yet?
return AArch64::X0;
}
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
// FIXME: This is a guess. Has this been defined yet?
return AArch64::X1;
}
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const override {
// Do not merge to a float value size (128 bits) if the NoImplicitFloat
// attribute is set.
bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (NoFloat)
return (MemVT.getSizeInBits() <= 64);
return true;
}
bool isCheapToSpeculateCttz() const override {
return true;
}
bool isCheapToSpeculateCtlz() const override {
return true;
}
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
bool hasAndNotCompare(SDValue V) const override {
// We can use bics for any scalar.
return V.getValueType().isScalarInteger();
}
bool hasAndNot(SDValue Y) const override {
EVT VT = Y.getValueType();
if (!VT.isVector())
return hasAndNotCompare(Y);
return VT.getSizeInBits() >= 64; // vector 'bic'
}
- bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
- if (DAG.getMachineFunction().getFunction().hasMinSize())
- return false;
- return true;
- }
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
bool shouldTransformSignedTruncationCheck(EVT XVT,
unsigned KeptBits) const override {
// For vectors, we don't have a preference.
if (XVT.isVector())
return false;
auto VTIsOk = [](EVT VT) -> bool {
return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
VT == MVT::i64;
};
// We are OK with KeptBitsVT being byte/word/dword, which is what SXT supports.
// XVT will be larger than KeptBitsVT.
MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
bool preferIncOfAddToSubOfNot(EVT VT) const override;
bool hasBitPreservingFPLogic(EVT VT) const override {
// FIXME: Is this always true? It should be true for vectors at least.
return VT == MVT::f32 || VT == MVT::f64;
}
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
}
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
bool supportSwiftError() const override {
return true;
}
/// Enable aggressive FMA fusion on targets that want it.
bool enableAggressiveFMAFusion(EVT VT) const override;
/// Returns the size of the platform's va_list object.
unsigned getVaListSizeInBits(const DataLayout &DL) const override;
/// Returns true if \p VecTy is a legal interleaved access type. This
/// function checks the vector element type and the overall width of the
/// vector.
bool isLegalInterleavedAccessType(VectorType *VecTy,
const DataLayout &DL) const;
/// Returns the number of interleaved accesses that will be generated when
/// lowering accesses of the given type.
unsigned getNumInterleavedAccesses(VectorType *VecTy,
const DataLayout &DL) const;
MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
CallingConv::ID CallConv,
bool isVarArg) const override;
/// Used for exception handling on Win64.
bool needsFixedCatchObjects() const override;
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
bool isExtFreeImpl(const Instruction *Ext) const override;
void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo & /*CLI*/,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
/// Finds the incoming stack arguments which overlap the given fixed stack
/// object and incorporates their load into the current chain. This prevents
/// an upcoming store from clobbering the stack argument before it's used.
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
MachineFrameInfo &MFI, int ClobberedFI) const;
bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL,
SDValue &Chain) const;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
unsigned Flag) const;
SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
unsigned Flag) const;
SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
unsigned Flag) const;
SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
unsigned Flag) const;
template <class NodeTy>
SDValue getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
template <class NodeTy>
SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
template <class NodeTy>
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
template <class NodeTy>
SDValue getAddrTiny(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
SelectionDAG &DAG) const;
SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
SDValue TVal, SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
SDValue &Size,
SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &ExtraSteps, bool &UseOneConst,
bool Reciprocal) const override;
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &ExtraSteps) const override;
unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
ConstraintWeight
getSingleConstraintMatchWeight(AsmOperandInfo &info,
const char *constraint) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
const char *LowerXConstraint(EVT ConstraintVT) const override;
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "Q")
return InlineAsm::Constraint_Q;
// FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are
// followed by llvm_unreachable so we'll leave them unimplemented in
// the backend for now.
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM, bool &IsInc,
SelectionDAG &DAG) const;
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
SDValue &Offset, ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
void finalizeLowering(MachineFunction &MF) const override;
};
namespace AArch64 {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
} // end namespace AArch64
} // end namespace llvm
#endif
Index: projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (revision 351722)
@@ -1,5561 +1,5573 @@
//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>
using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"
static cl::opt<unsigned> TBZDisplacementBits(
"aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
static cl::opt<unsigned> CBZDisplacementBits(
"aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
static cl::opt<unsigned>
BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
cl::desc("Restrict range of Bcc instructions (DEBUG)"));
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
AArch64::CATCHRET),
RI(STI.getTargetTriple()), Subtarget(STI) {}
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
const MachineBasicBlock &MBB = *MI.getParent();
const MachineFunction *MF = MBB.getParent();
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
{
auto Op = MI.getOpcode();
if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
}
// FIXME: We currently only handle pseudoinstructions that don't get expanded
// before the assembly printer.
unsigned NumBytes = 0;
const MCInstrDesc &Desc = MI.getDesc();
switch (Desc.getOpcode()) {
default:
// Anything not explicitly designated otherwise is a normal 4-byte insn.
NumBytes = 4;
break;
case TargetOpcode::DBG_VALUE:
case TargetOpcode::EH_LABEL:
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
NumBytes = 0;
break;
case TargetOpcode::STACKMAP:
// The upper bound for a stackmap intrinsic is the full length of its shadow
NumBytes = StackMapOpers(&MI).getNumPatchBytes();
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
break;
case TargetOpcode::PATCHPOINT:
// The size of the patchpoint intrinsic is the number of bytes requested
NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
break;
case AArch64::TLSDESC_CALLSEQ:
// This gets lowered to an instruction sequence which takes 16 bytes
NumBytes = 16;
break;
case AArch64::JumpTableDest32:
case AArch64::JumpTableDest16:
case AArch64::JumpTableDest8:
NumBytes = 12;
break;
case AArch64::SPACE:
NumBytes = MI.getOperand(1).getImm();
break;
}
return NumBytes;
}
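// Decode a conditional-branch terminator into its target block and a Cond
// operand vector: for Bcc this is just the condition code; for CB[N]Z and
// TB[N]Z a sentinel immediate of -1 is followed by the original opcode, the
// tested register and, for TB[N]Z, the bit number.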
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
SmallVectorImpl<MachineOperand> &Cond) {
// Block ends with fall-through condbranch.
switch (LastInst->getOpcode()) {
default:
llvm_unreachable("Unknown branch instruction?");
case AArch64::Bcc:
Target = LastInst->getOperand(1).getMBB();
Cond.push_back(LastInst->getOperand(0));
break;
case AArch64::CBZW:
case AArch64::CBZX:
case AArch64::CBNZW:
case AArch64::CBNZX:
Target = LastInst->getOperand(1).getMBB();
Cond.push_back(MachineOperand::CreateImm(-1));
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
Cond.push_back(LastInst->getOperand(0));
break;
case AArch64::TBZW:
case AArch64::TBZX:
case AArch64::TBNZW:
case AArch64::TBNZX:
Target = LastInst->getOperand(2).getMBB();
Cond.push_back(MachineOperand::CreateImm(-1));
Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
Cond.push_back(LastInst->getOperand(0));
Cond.push_back(LastInst->getOperand(1));
}
}
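// Number of displacement bits each branch opcode can encode; the unconditional
// B is treated as effectively unlimited (64 bits), while the others default to
// the values of the -aarch64-*-offset-bits options declared above.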
static unsigned getBranchDisplacementBits(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("unexpected opcode!");
case AArch64::B:
return 64;
case AArch64::TBNZW:
case AArch64::TBZW:
case AArch64::TBNZX:
case AArch64::TBZX:
return TBZDisplacementBits;
case AArch64::CBNZW:
case AArch64::CBZW:
case AArch64::CBNZX:
case AArch64::CBZX:
return CBZDisplacementBits;
case AArch64::Bcc:
return BCCDisplacementBits;
}
}
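// Branch displacements are measured in 4-byte instruction units, hence the
// division of BrOffset by 4 below.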
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
int64_t BrOffset) const {
unsigned Bits = getBranchDisplacementBits(BranchOp);
assert(Bits >= 3 && "max branch displacement must be enough to jump "
"over conditional branch expansion");
return isIntN(Bits, BrOffset / 4);
}
MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
llvm_unreachable("unexpected opcode!");
case AArch64::B:
return MI.getOperand(0).getMBB();
case AArch64::TBZW:
case AArch64::TBNZW:
case AArch64::TBZX:
case AArch64::TBNZX:
return MI.getOperand(2).getMBB();
case AArch64::CBZW:
case AArch64::CBNZW:
case AArch64::CBZX:
case AArch64::CBNZX:
case AArch64::Bcc:
return MI.getOperand(1).getMBB();
}
}
// Branch analysis.
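// Following the TargetInstrInfo convention, analyzeBranch returns false when
// the block's terminators are understood (filling in TBB, FBB and Cond) and
// true when the control flow cannot be analyzed (e.g. indirect branches).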
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return false;
if (!isUnpredicatedTerminator(*I))
return false;
// Get the last instruction in the block.
MachineInstr *LastInst = &*I;
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst->getOpcode();
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
if (isUncondBranchOpcode(LastOpc)) {
TBB = LastInst->getOperand(0).getMBB();
return false;
}
if (isCondBranchOpcode(LastOpc)) {
// Block ends with fall-through condbranch.
parseCondBranch(LastInst, TBB, Cond);
return false;
}
return true; // Can't handle indirect branch.
}
// Get the instruction before it if it is a terminator.
MachineInstr *SecondLastInst = &*I;
unsigned SecondLastOpc = SecondLastInst->getOpcode();
// If AllowModify is true and the block ends with two or more unconditional
// branches, delete all but the first unconditional branch.
if (AllowModify && isUncondBranchOpcode(LastOpc)) {
while (isUncondBranchOpcode(SecondLastOpc)) {
LastInst->eraseFromParent();
LastInst = SecondLastInst;
LastOpc = LastInst->getOpcode();
if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
// Return now; the only terminator is an unconditional branch.
TBB = LastInst->getOperand(0).getMBB();
return false;
} else {
SecondLastInst = &*I;
SecondLastOpc = SecondLastInst->getOpcode();
}
}
}
// If there are three terminators, we don't know what sort of block this is.
if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
// If the block ends with a B and a Bcc, handle it.
if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
parseCondBranch(SecondLastInst, TBB, Cond);
FBB = LastInst->getOperand(0).getMBB();
return false;
}
// If the block ends with two unconditional branches, handle it. The second
// one is not executed, so remove it.
if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
TBB = SecondLastInst->getOperand(0).getMBB();
I = LastInst;
if (AllowModify)
I->eraseFromParent();
return false;
}
// ...likewise if it ends with an indirect branch followed by an unconditional
// branch.
if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
I = LastInst;
if (AllowModify)
I->eraseFromParent();
return true;
}
// Otherwise, can't handle this.
return true;
}
bool AArch64InstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
if (Cond[0].getImm() != -1) {
// Regular Bcc
AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
} else {
// Folded compare-and-branch
switch (Cond[1].getImm()) {
default:
llvm_unreachable("Unknown conditional branch!");
case AArch64::CBZW:
Cond[1].setImm(AArch64::CBNZW);
break;
case AArch64::CBNZW:
Cond[1].setImm(AArch64::CBZW);
break;
case AArch64::CBZX:
Cond[1].setImm(AArch64::CBNZX);
break;
case AArch64::CBNZX:
Cond[1].setImm(AArch64::CBZX);
break;
case AArch64::TBZW:
Cond[1].setImm(AArch64::TBNZW);
break;
case AArch64::TBNZW:
Cond[1].setImm(AArch64::TBZW);
break;
case AArch64::TBZX:
Cond[1].setImm(AArch64::TBNZX);
break;
case AArch64::TBNZX:
Cond[1].setImm(AArch64::TBZX);
break;
}
}
return false;
}
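// Remove up to two terminating branches: the trailing branch and, if present,
// the conditional branch before it. Each removed instruction is reported as
// 4 bytes via BytesRemoved.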
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return 0;
if (!isUncondBranchOpcode(I->getOpcode()) &&
!isCondBranchOpcode(I->getOpcode()))
return 0;
// Remove the branch.
I->eraseFromParent();
I = MBB.end();
if (I == MBB.begin()) {
if (BytesRemoved)
*BytesRemoved = 4;
return 1;
}
--I;
if (!isCondBranchOpcode(I->getOpcode())) {
if (BytesRemoved)
*BytesRemoved = 4;
return 1;
}
// Remove the branch.
I->eraseFromParent();
if (BytesRemoved)
*BytesRemoved = 8;
return 2;
}
void AArch64InstrInfo::instantiateCondBranch(
MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const {
if (Cond[0].getImm() != -1) {
// Regular Bcc
BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
} else {
// Folded compare-and-branch
// Note that we use add() instead of addReg() to preserve the operand flags.
const MachineInstrBuilder MIB =
BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
if (Cond.size() > 3)
MIB.addImm(Cond[3].getImm());
MIB.addMBB(TBB);
}
}
unsigned AArch64InstrInfo::insertBranch(
MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
// Shouldn't be a fall through.
assert(TBB && "insertBranch must not be told to insert a fallthrough");
if (!FBB) {
if (Cond.empty()) // Unconditional branch?
BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
else
instantiateCondBranch(MBB, DL, TBB, Cond);
if (BytesAdded)
*BytesAdded = 4;
return 1;
}
// Two-way conditional branch.
instantiateCondBranch(MBB, DL, TBB, Cond);
BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
if (BytesAdded)
*BytesAdded = 8;
return 2;
}
// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
while (TargetRegisterInfo::isVirtualRegister(VReg)) {
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
if (!DefMI->isFullCopy())
return VReg;
VReg = DefMI->getOperand(1).getReg();
}
return VReg;
}
// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
unsigned *NewVReg = nullptr) {
VReg = removeCopies(MRI, VReg);
if (!TargetRegisterInfo::isVirtualRegister(VReg))
return 0;
bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
unsigned Opc = 0;
unsigned SrcOpNum = 0;
switch (DefMI->getOpcode()) {
case AArch64::ADDSXri:
case AArch64::ADDSWri:
// if NZCV is used, do not fold.
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
return 0;
// fall-through to ADDXri and ADDWri.
LLVM_FALLTHROUGH;
case AArch64::ADDXri:
case AArch64::ADDWri:
// add x, 1 -> csinc.
if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
DefMI->getOperand(3).getImm() != 0)
return 0;
SrcOpNum = 1;
Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
break;
case AArch64::ORNXrr:
case AArch64::ORNWrr: {
// not x -> csinv, represented as orn dst, xzr, src.
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
return 0;
SrcOpNum = 2;
Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
break;
}
case AArch64::SUBSXrr:
case AArch64::SUBSWrr:
// if NZCV is used, do not fold.
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
return 0;
// fall-through to SUBXrr and SUBWrr.
LLVM_FALLTHROUGH;
case AArch64::SUBXrr:
case AArch64::SUBWrr: {
// neg x -> csneg, represented as sub dst, xzr, src.
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
return 0;
SrcOpNum = 2;
Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
break;
}
default:
return 0;
}
assert(Opc && SrcOpNum && "Missing parameters");
if (NewVReg)
*NewVReg = DefMI->getOperand(SrcOpNum).getReg();
return Opc;
}
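// A minimal illustration (hypothetical virtual registers, not real MIR) of the
// fold this enables in insertSelect() below:
//   %x = ADDXri %y, 1, 0       ; %x = %y + 1
//   %d = CSELXr %x, %f, cc     ; select %x when cc holds
// becomes
//   %d = CSINCXr %f, %y, !cc   ; %f when !cc holds, otherwise %y + 1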
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles,
int &FalseCycles) const {
// Check register classes.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
if (!RC)
return false;
// Expanding cbz/tbz requires an extra cycle of latency on the condition.
unsigned ExtraCondLat = Cond.size() != 1;
// GPRs are handled by csel.
// FIXME: Fold in x+1, -x, and ~x when applicable.
if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
// Single-cycle csel, csinc, csinv, and csneg.
CondCycles = 1 + ExtraCondLat;
TrueCycles = FalseCycles = 1;
if (canFoldIntoCSel(MRI, TrueReg))
TrueCycles = 0;
else if (canFoldIntoCSel(MRI, FalseReg))
FalseCycles = 0;
return true;
}
// Scalar floating point is handled by fcsel.
// FIXME: Form fabs, fmin, and fmax when applicable.
if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
AArch64::FPR32RegClass.hasSubClassEq(RC)) {
CondCycles = 5 + ExtraCondLat;
TrueCycles = FalseCycles = 2;
return true;
}
// Can't do vectors.
return false;
}
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DstReg,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Parse the condition code, see parseCondBranch() above.
AArch64CC::CondCode CC;
switch (Cond.size()) {
default:
llvm_unreachable("Unknown condition opcode in Cond");
case 1: // b.cc
CC = AArch64CC::CondCode(Cond[0].getImm());
break;
case 3: { // cbz/cbnz
// We must insert a compare against 0.
bool Is64Bit;
switch (Cond[1].getImm()) {
default:
llvm_unreachable("Unknown branch opcode in Cond");
case AArch64::CBZW:
Is64Bit = false;
CC = AArch64CC::EQ;
break;
case AArch64::CBZX:
Is64Bit = true;
CC = AArch64CC::EQ;
break;
case AArch64::CBNZW:
Is64Bit = false;
CC = AArch64CC::NE;
break;
case AArch64::CBNZX:
Is64Bit = true;
CC = AArch64CC::NE;
break;
}
unsigned SrcReg = Cond[2].getReg();
if (Is64Bit) {
// cmp reg, #0 is actually subs xzr, reg, #0.
MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
.addReg(SrcReg)
.addImm(0)
.addImm(0);
} else {
MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
.addReg(SrcReg)
.addImm(0)
.addImm(0);
}
break;
}
case 4: { // tbz/tbnz
// We must insert a tst instruction.
switch (Cond[1].getImm()) {
default:
llvm_unreachable("Unknown branch opcode in Cond");
case AArch64::TBZW:
case AArch64::TBZX:
CC = AArch64CC::EQ;
break;
case AArch64::TBNZW:
case AArch64::TBNZX:
CC = AArch64CC::NE;
break;
}
// cmp reg, #foo is actually ands xzr, reg, #1<<foo.
if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
.addReg(Cond[2].getReg())
.addImm(
AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
else
BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
.addReg(Cond[2].getReg())
.addImm(
AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
break;
}
}
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
bool TryFold = false;
if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
RC = &AArch64::GPR64RegClass;
Opc = AArch64::CSELXr;
TryFold = true;
} else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
RC = &AArch64::GPR32RegClass;
Opc = AArch64::CSELWr;
TryFold = true;
} else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FCSELDrrr;
} else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
RC = &AArch64::FPR32RegClass;
Opc = AArch64::FCSELSrrr;
}
assert(RC && "Unsupported regclass");
// Try folding simple instructions into the csel.
if (TryFold) {
unsigned NewVReg = 0;
unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
if (FoldedOpc) {
// The folded opcodes csinc, csinv and csneg apply the operation to
// FalseReg, so we need to invert the condition.
CC = AArch64CC::getInvertedCondCode(CC);
TrueReg = FalseReg;
} else
FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
// Fold the operation. Leave any dead instructions for DCE to clean up.
if (FoldedOpc) {
FalseReg = NewVReg;
Opc = FoldedOpc;
// This extends the live range of NewVReg.
MRI.clearKillFlags(NewVReg);
}
}
// Pull all virtual registers into the appropriate class.
MRI.constrainRegClass(TrueReg, RC);
MRI.constrainRegClass(FalseReg, RC);
// Insert the csel.
BuildMI(MBB, I, DL, get(Opc), DstReg)
.addReg(TrueReg)
.addReg(FalseReg)
.addImm(CC);
}
/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
uint64_t Imm = MI.getOperand(1).getImm();
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
uint64_t Encoding;
return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
if (!Subtarget.hasCustomCheapAsMoveHandling())
return MI.isAsCheapAsAMove();
const unsigned Opcode = MI.getOpcode();
// Firstly, check cases gated by features.
if (Subtarget.hasZeroCycleZeroingFP()) {
if (Opcode == AArch64::FMOVH0 ||
Opcode == AArch64::FMOVS0 ||
Opcode == AArch64::FMOVD0)
return true;
}
if (Subtarget.hasZeroCycleZeroingGP()) {
if (Opcode == TargetOpcode::COPY &&
(MI.getOperand(1).getReg() == AArch64::WZR ||
MI.getOperand(1).getReg() == AArch64::XZR))
return true;
}
// Secondly, check cases specific to sub-targets.
if (Subtarget.hasExynosCheapAsMoveHandling()) {
if (isExynosCheapAsMove(MI))
return true;
return MI.isAsCheapAsAMove();
}
// Finally, check generic cases.
switch (Opcode) {
default:
return false;
// add/sub on register without shift
case AArch64::ADDWri:
case AArch64::ADDXri:
case AArch64::SUBWri:
case AArch64::SUBXri:
return (MI.getOperand(3).getImm() == 0);
// logical ops on immediate
case AArch64::ANDWri:
case AArch64::ANDXri:
case AArch64::EORWri:
case AArch64::EORXri:
case AArch64::ORRWri:
case AArch64::ORRXri:
return true;
// logical ops on register without shift
case AArch64::ANDWrr:
case AArch64::ANDXrr:
case AArch64::BICWrr:
case AArch64::BICXrr:
case AArch64::EONWrr:
case AArch64::EONXrr:
case AArch64::EORWrr:
case AArch64::EORXrr:
case AArch64::ORNWrr:
case AArch64::ORNXrr:
case AArch64::ORRWrr:
case AArch64::ORRXrr:
return true;
// If MOVi32imm or MOVi64imm can be expanded into ORRWri or
// ORRXri, it is as cheap as MOV
case AArch64::MOVi32imm:
return canBeExpandedToORR(MI, 32);
case AArch64::MOVi64imm:
return canBeExpandedToORR(MI, 64);
}
llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
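// Returns true when the shift or extend folded into MI is expected to be cheap
// on Falkor: zero or small LSL amounts on adds, zero or full-width ASR amounts
// on subs, unsigned extends with a shift of at most 4 (adds) or 0 (subs), and
// register-offset loads/stores whose offset register is not sign-extended.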
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
case AArch64::ADDWrs:
case AArch64::ADDXrs:
case AArch64::ADDSWrs:
case AArch64::ADDSXrs: {
unsigned Imm = MI.getOperand(3).getImm();
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
if (ShiftVal == 0)
return true;
return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
}
case AArch64::ADDWrx:
case AArch64::ADDXrx:
case AArch64::ADDXrx64:
case AArch64::ADDSWrx:
case AArch64::ADDSXrx:
case AArch64::ADDSXrx64: {
unsigned Imm = MI.getOperand(3).getImm();
switch (AArch64_AM::getArithExtendType(Imm)) {
default:
return false;
case AArch64_AM::UXTB:
case AArch64_AM::UXTH:
case AArch64_AM::UXTW:
case AArch64_AM::UXTX:
return AArch64_AM::getArithShiftValue(Imm) <= 4;
}
}
case AArch64::SUBWrs:
case AArch64::SUBSWrs: {
unsigned Imm = MI.getOperand(3).getImm();
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
return ShiftVal == 0 ||
(AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
}
case AArch64::SUBXrs:
case AArch64::SUBSXrs: {
unsigned Imm = MI.getOperand(3).getImm();
unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
return ShiftVal == 0 ||
(AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
}
case AArch64::SUBWrx:
case AArch64::SUBXrx:
case AArch64::SUBXrx64:
case AArch64::SUBSWrx:
case AArch64::SUBSXrx:
case AArch64::SUBSXrx64: {
unsigned Imm = MI.getOperand(3).getImm();
switch (AArch64_AM::getArithExtendType(Imm)) {
default:
return false;
case AArch64_AM::UXTB:
case AArch64_AM::UXTH:
case AArch64_AM::UXTW:
case AArch64_AM::UXTX:
return AArch64_AM::getArithShiftValue(Imm) == 0;
}
}
case AArch64::LDRBBroW:
case AArch64::LDRBBroX:
case AArch64::LDRBroW:
case AArch64::LDRBroX:
case AArch64::LDRDroW:
case AArch64::LDRDroX:
case AArch64::LDRHHroW:
case AArch64::LDRHHroX:
case AArch64::LDRHroW:
case AArch64::LDRHroX:
case AArch64::LDRQroW:
case AArch64::LDRQroX:
case AArch64::LDRSBWroW:
case AArch64::LDRSBWroX:
case AArch64::LDRSBXroW:
case AArch64::LDRSBXroX:
case AArch64::LDRSHWroW:
case AArch64::LDRSHWroX:
case AArch64::LDRSHXroW:
case AArch64::LDRSHXroX:
case AArch64::LDRSWroW:
case AArch64::LDRSWroX:
case AArch64::LDRSroW:
case AArch64::LDRSroX:
case AArch64::LDRWroW:
case AArch64::LDRWroX:
case AArch64::LDRXroW:
case AArch64::LDRXroX:
case AArch64::PRFMroW:
case AArch64::PRFMroX:
case AArch64::STRBBroW:
case AArch64::STRBBroX:
case AArch64::STRBroW:
case AArch64::STRBroX:
case AArch64::STRDroW:
case AArch64::STRDroX:
case AArch64::STRHHroW:
case AArch64::STRHHroX:
case AArch64::STRHroW:
case AArch64::STRHroX:
case AArch64::STRQroW:
case AArch64::STRQroX:
case AArch64::STRSroW:
case AArch64::STRSroX:
case AArch64::STRWroW:
case AArch64::STRWroX:
case AArch64::STRXroW:
case AArch64::STRXroX: {
unsigned IsSigned = MI.getOperand(3).getImm();
return !IsSigned;
}
}
}
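// True for the pseudo instructions that carry Windows SEH unwind information
// (the .seh_* directives) in prologues and epilogues.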
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
return false;
case AArch64::SEH_StackAlloc:
case AArch64::SEH_SaveFPLR:
case AArch64::SEH_SaveFPLR_X:
case AArch64::SEH_SaveReg:
case AArch64::SEH_SaveReg_X:
case AArch64::SEH_SaveRegP:
case AArch64::SEH_SaveRegP_X:
case AArch64::SEH_SaveFReg:
case AArch64::SEH_SaveFReg_X:
case AArch64::SEH_SaveFRegP:
case AArch64::SEH_SaveFRegP_X:
case AArch64::SEH_SetFP:
case AArch64::SEH_AddFP:
case AArch64::SEH_Nop:
case AArch64::SEH_PrologEnd:
case AArch64::SEH_EpilogStart:
case AArch64::SEH_EpilogEnd:
return true;
}
}
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default:
return false;
case AArch64::SBFMXri: // aka sxtw
case AArch64::UBFMXri: // aka uxtw
// Check for the 32 -> 64 bit extension case; these instructions can do
// much more.
if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
return false;
// This is a signed or unsigned 32 -> 64 bit extension.
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
SubIdx = AArch64::sub_32;
return true;
}
}
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
unsigned WidthA = 0, WidthB = 0;
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
// Retrieve the base, the offset from the base, and the width. Width is the
// size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the bases
// are identical and the offset of the lower memory access plus its width does
// not reach the offset of the higher memory access, then the accesses are
// disjoint.
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
if (BaseOpA->isIdenticalTo(*BaseOpB)) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
if (LowOffset + LowWidth <= HighOffset)
return true;
}
}
return false;
}
bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
return true;
switch (MI.getOpcode()) {
case AArch64::HINT:
// CSDB hints are scheduling barriers.
if (MI.getOperand(0).getImm() == 0x14)
return true;
break;
case AArch64::DSB:
case AArch64::ISB:
// DSB and ISB also are scheduling barriers.
return true;
default:;
}
return isSEHInstruction(MI);
}
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const {
// The first operand can be a frame index where we'd normally expect a
// register.
assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
if (!MI.getOperand(1).isReg())
return false;
switch (MI.getOpcode()) {
default:
break;
case AArch64::SUBSWrr:
case AArch64::SUBSWrs:
case AArch64::SUBSWrx:
case AArch64::SUBSXrr:
case AArch64::SUBSXrs:
case AArch64::SUBSXrx:
case AArch64::ADDSWrr:
case AArch64::ADDSWrs:
case AArch64::ADDSWrx:
case AArch64::ADDSXrr:
case AArch64::ADDSXrs:
case AArch64::ADDSXrx:
// Replace SUBSWrr with SUBWrr if NZCV is not used.
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
case AArch64::SUBSWri:
case AArch64::ADDSWri:
case AArch64::SUBSXri:
case AArch64::ADDSXri:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
// FIXME: In order to convert CmpValue to 0 or 1
CmpValue = MI.getOperand(2).getImm() != 0;
return true;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
// ANDS does not use the same encoding scheme as the other xxxS
// instructions.
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
// FIXME: The return value type of decodeLogicalImmediate is uint64_t,
// while the type of CmpValue is int. When converting uint64_t to int,
// the high 32 bits of the uint64_t will be lost.
// In fact this caused a bug in spec2006-483.xalancbmk.
// CmpValue is only used to compare with zero in optimizeCompareInstr.
CmpValue = AArch64_AM::decodeLogicalImmediate(
MI.getOperand(2).getImm(),
MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
return true;
}
return false;
}
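// Re-constrain every register operand of Instr to the register class required
// by its (possibly just rewritten) opcode. Returns false if a physical
// register operand does not satisfy its constraint.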
static bool UpdateOperandRegClass(MachineInstr &Instr) {
MachineBasicBlock *MBB = Instr.getParent();
assert(MBB && "Can't get MachineBasicBlock here");
MachineFunction *MF = MBB->getParent();
assert(MF && "Can't get MachineFunction here");
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
++OpIdx) {
MachineOperand &MO = Instr.getOperand(OpIdx);
const TargetRegisterClass *OpRegCstraints =
Instr.getRegClassConstraint(OpIdx, TII, TRI);
// If there's no constraint, there's nothing to do.
if (!OpRegCstraints)
continue;
// If the operand is a frame index, there's nothing to do here.
// A frame index operand will resolve correctly during PEI.
if (MO.isFI())
continue;
assert(MO.isReg() &&
"Operand has register constraints without being a register!");
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
if (!OpRegCstraints->contains(Reg))
return false;
} else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
!MRI->constrainRegClass(Reg, OpRegCstraints))
return false;
}
return true;
}
/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
// Don't convert all compare instructions, because for some the zero register
// encoding becomes the sp register.
bool MIDefinesZeroReg = false;
if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
MIDefinesZeroReg = true;
switch (MI.getOpcode()) {
default:
return MI.getOpcode();
case AArch64::ADDSWrr:
return AArch64::ADDWrr;
case AArch64::ADDSWri:
return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
case AArch64::ADDSWrs:
return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
case AArch64::ADDSWrx:
return AArch64::ADDWrx;
case AArch64::ADDSXrr:
return AArch64::ADDXrr;
case AArch64::ADDSXri:
return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
case AArch64::ADDSXrs:
return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
case AArch64::ADDSXrx:
return AArch64::ADDXrx;
case AArch64::SUBSWrr:
return AArch64::SUBWrr;
case AArch64::SUBSWri:
return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
case AArch64::SUBSWrs:
return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
case AArch64::SUBSWrx:
return AArch64::SUBWrx;
case AArch64::SUBSXrr:
return AArch64::SUBXrr;
case AArch64::SUBSXri:
return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
case AArch64::SUBSXrs:
return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
case AArch64::SUBSXrx:
return AArch64::SUBXrx;
}
}
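// Bitmask selecting which kinds of NZCV accesses to look for: writes, reads,
// or both.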
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
// Early exit if To is at the beginning of the BB.
if (To == To->getParent()->begin())
return true;
// Check whether the instructions are in the same basic block
// If not, assume the condition flags might get modified somewhere.
if (To->getParent() != From->getParent())
return true;
// From must be above To.
assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
[From](MachineInstr &MI) {
return MI.getIterator() == From;
}) != To->getParent()->rend());
// We iterate backward starting at \p To until we hit \p From.
for (--To; To != From; --To) {
const MachineInstr &Instr = *To;
if (((AccessToCheck & AK_Write) &&
Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
return true;
}
return false;
}
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be a true compare
/// instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
/// condition code or an instruction which can be converted into such an
/// instruction.
/// Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
assert(CmpInstr.getParent());
assert(MRI);
// Replace SUBSWrr with SUBWrr if NZCV is not used.
int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
if (DeadNZCVIdx != -1) {
if (CmpInstr.definesRegister(AArch64::WZR) ||
CmpInstr.definesRegister(AArch64::XZR)) {
CmpInstr.eraseFromParent();
return true;
}
unsigned Opc = CmpInstr.getOpcode();
unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
if (NewOpc == Opc)
return false;
const MCInstrDesc &MCID = get(NewOpc);
CmpInstr.setDesc(MCID);
CmpInstr.RemoveOperand(DeadNZCVIdx);
bool succeeded = UpdateOperandRegClass(CmpInstr);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
return true;
}
// Continue only if we have an "ri" form where the immediate is zero.
// FIXME: CmpValue has already been converted to 0 or 1 in the analyzeCompare
// function.
assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
if (CmpValue != 0 || SrcReg2 != 0)
return false;
// CmpInstr is a Compare instruction if destination register is not used.
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
return substituteCmpToZero(CmpInstr, SrcReg, MRI);
}
/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
switch (Instr.getOpcode()) {
default:
return AArch64::INSTRUCTION_LIST_END;
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSWrr:
case AArch64::SUBSWri:
case AArch64::SUBSXrr:
case AArch64::SUBSXri:
return Instr.getOpcode();
case AArch64::ADDWrr:
return AArch64::ADDSWrr;
case AArch64::ADDWri:
return AArch64::ADDSWri;
case AArch64::ADDXrr:
return AArch64::ADDSXrr;
case AArch64::ADDXri:
return AArch64::ADDSXri;
case AArch64::ADCWr:
return AArch64::ADCSWr;
case AArch64::ADCXr:
return AArch64::ADCSXr;
case AArch64::SUBWrr:
return AArch64::SUBSWrr;
case AArch64::SUBWri:
return AArch64::SUBSWri;
case AArch64::SUBXrr:
return AArch64::SUBSXrr;
case AArch64::SUBXri:
return AArch64::SUBSXri;
case AArch64::SBCWr:
return AArch64::SBCSWr;
case AArch64::SBCXr:
return AArch64::SBCSXr;
case AArch64::ANDWri:
return AArch64::ANDSWri;
case AArch64::ANDXri:
return AArch64::ANDSXri;
}
}
/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
for (auto *BB : MBB->successors())
if (BB->isLiveIn(AArch64::NZCV))
return true;
return false;
}
namespace {
struct UsedNZCV {
bool N = false;
bool Z = false;
bool C = false;
bool V = false;
UsedNZCV() = default;
UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
this->N |= UsedFlags.N;
this->Z |= UsedFlags.Z;
this->C |= UsedFlags.C;
this->V |= UsedFlags.V;
return *this;
}
};
} // end anonymous namespace
/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
switch (Instr.getOpcode()) {
default:
return AArch64CC::Invalid;
case AArch64::Bcc: {
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
assert(Idx >= 2);
return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
}
case AArch64::CSINVWr:
case AArch64::CSINVXr:
case AArch64::CSINCWr:
case AArch64::CSINCXr:
case AArch64::CSELWr:
case AArch64::CSELXr:
case AArch64::CSNEGWr:
case AArch64::CSNEGXr:
case AArch64::FCSELSrrr:
case AArch64::FCSELDrrr: {
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
assert(Idx >= 1);
return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
}
}
}
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
assert(CC != AArch64CC::Invalid);
UsedNZCV UsedFlags;
switch (CC) {
default:
break;
case AArch64CC::EQ: // Z set
case AArch64CC::NE: // Z clear
UsedFlags.Z = true;
break;
case AArch64CC::HI: // Z clear and C set
case AArch64CC::LS: // Z set or C clear
UsedFlags.Z = true;
LLVM_FALLTHROUGH;
case AArch64CC::HS: // C set
case AArch64CC::LO: // C clear
UsedFlags.C = true;
break;
case AArch64CC::MI: // N set
case AArch64CC::PL: // N clear
UsedFlags.N = true;
break;
case AArch64CC::VS: // V set
case AArch64CC::VC: // V clear
UsedFlags.V = true;
break;
case AArch64CC::GT: // Z clear, N and V the same
case AArch64CC::LE: // Z set, N and V differ
UsedFlags.Z = true;
LLVM_FALLTHROUGH;
case AArch64CC::GE: // N and V the same
case AArch64CC::LT: // N and V differ
UsedFlags.N = true;
UsedFlags.V = true;
break;
}
return UsedFlags;
}
static bool isADDSRegImm(unsigned Opcode) {
return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}
static bool isSUBSRegImm(unsigned Opcode) {
return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}
/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
/// MI and CmpInstr
/// or if MI opcode is not the S form there must be neither defs of flags
/// nor uses of flags between MI and CmpInstr.
/// - and C/V flags are not used after CmpInstr
static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
const TargetRegisterInfo *TRI) {
assert(MI);
assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
assert(CmpInstr);
const unsigned CmpOpcode = CmpInstr->getOpcode();
if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
return false;
if (MI->getParent() != CmpInstr->getParent())
return false;
if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
return false;
AccessKind AccessToCheck = AK_Write;
if (sForm(*MI) != MI->getOpcode())
AccessToCheck = AK_All;
if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
return false;
UsedNZCV NZCVUsedAfterCmp;
for (auto I = std::next(CmpInstr->getIterator()),
E = CmpInstr->getParent()->instr_end();
I != E; ++I) {
const MachineInstr &Instr = *I;
if (Instr.readsRegister(AArch64::NZCV, TRI)) {
AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
return false;
NZCVUsedAfterCmp |= getUsedNZCV(CC);
}
if (Instr.modifiesRegister(AArch64::NZCV, TRI))
break;
}
return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
}
/// Substitute an instruction comparing to zero with another instruction
/// which produces needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
MachineInstr &CmpInstr, unsigned SrcReg,
const MachineRegisterInfo *MRI) const {
assert(MRI);
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI)
return false;
const TargetRegisterInfo *TRI = &getRegisterInfo();
unsigned NewOpc = sForm(*MI);
if (NewOpc == AArch64::INSTRUCTION_LIST_END)
return false;
if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
return false;
// Update the instruction to set NZCV.
MI->setDesc(get(NewOpc));
CmpInstr.eraseFromParent();
bool succeeded = UpdateOperandRegClass(*MI);
(void)succeeded;
assert(succeeded && "Some operands reg class are incompatible!");
MI->addRegisterDefined(AArch64::NZCV, TRI);
return true;
}
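// Only the LOAD_STACK_GUARD and CATCHRET pseudos are handled here; for any
// other opcode the hook returns false and leaves the instruction untouched.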
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
MI.getOpcode() != AArch64::CATCHRET)
return false;
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
if (MI.getOpcode() == AArch64::CATCHRET) {
// Skip to the first instruction before the epilog.
const TargetInstrInfo *TII =
MBB.getParent()->getSubtarget().getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
auto MBBI = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
FirstEpilogSEH != MBB.begin())
FirstEpilogSEH = std::prev(FirstEpilogSEH);
if (FirstEpilogSEH != MBB.begin())
FirstEpilogSEH = std::next(FirstEpilogSEH);
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
.addReg(AArch64::X0, RegState::Define)
.addMBB(TargetMBB);
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
.addReg(AArch64::X0, RegState::Define)
.addReg(AArch64::X0)
.addMBB(TargetMBB)
.addImm(0);
return true;
}
unsigned Reg = MI.getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MI.memoperands_begin())->getValue());
const TargetMachine &TM = MBB.getParent()->getTarget();
unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
const unsigned char MO_NC = AArch64II::MO_NC;
if ((OpFlags & AArch64II::MO_GOT) != 0) {
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
.addGlobalAddress(GV, 0, OpFlags);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addImm(0)
.addMemOperand(*MI.memoperands_begin());
} else if (TM.getCodeModel() == CodeModel::Large) {
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
.addImm(0);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
.addImm(16);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
.addImm(32);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G3)
.addImm(48);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addImm(0)
.addMemOperand(*MI.memoperands_begin());
} else if (TM.getCodeModel() == CodeModel::Tiny) {
BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
.addGlobalAddress(GV, 0, OpFlags);
} else {
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, LoFlags)
.addMemOperand(*MI.memoperands_begin());
}
MBB.erase(MI);
return true;
}
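// For the large code model, the expansion above materializes the address of
// the stack-guard global in four 16-bit pieces before loading through it,
// roughly (with "sym" standing for whatever global the LOAD_STACK_GUARD
// memoperand names):
//   movz x0, #:abs_g0_nc:sym
//   movk x0, #:abs_g1_nc:sym, lsl #16
//   movk x0, #:abs_g2_nc:sym, lsl #32
//   movk x0, #:abs_g3:sym, lsl #48
//   ldr  x0, [x0]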
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
case AArch64::MOVZWi:
case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
assert(MI.getDesc().getNumOperands() == 3 &&
MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
return true;
}
break;
case AArch64::ANDWri: // and Rd, Rzr, #imm
return MI.getOperand(1).getReg() == AArch64::WZR;
case AArch64::ANDXri:
return MI.getOperand(1).getReg() == AArch64::XZR;
case TargetOpcode::COPY:
return MI.getOperand(1).getReg() == AArch64::WZR;
}
return false;
}
// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
case TargetOpcode::COPY: {
// GPR32 copies will be lowered to ORRXrs
unsigned DstReg = MI.getOperand(0).getReg();
return (AArch64::GPR32RegClass.contains(DstReg) ||
AArch64::GPR64RegClass.contains(DstReg));
}
case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
if (MI.getOperand(1).getReg() == AArch64::XZR) {
assert(MI.getDesc().getNumOperands() == 4 &&
MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
return true;
}
break;
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
if (MI.getOperand(2).getImm() == 0) {
assert(MI.getDesc().getNumOperands() == 4 &&
MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
return true;
}
break;
}
return false;
}
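// For example, both of the following are recognized as plain renames:
//   $x0 = ORRXrs $xzr, $x1, 0    // orr x0, xzr, x1
//   $x0 = ADDXri $x1, 0, 0       // add x0, x1, #0
// whereas an ADDXri with a non-zero immediate or shift is not.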
// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
case TargetOpcode::COPY: {
// FPR64 copies will be lowered to ORR.16b
unsigned DstReg = MI.getOperand(0).getReg();
return (AArch64::FPR64RegClass.contains(DstReg) ||
AArch64::FPR128RegClass.contains(DstReg));
}
case AArch64::ORRv16i8:
if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
"invalid ORRv16i8 operands");
return true;
}
break;
}
return false;
}
unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
default:
break;
case AArch64::LDRWui:
case AArch64::LDRXui:
case AArch64::LDRBui:
case AArch64::LDRHui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
return MI.getOperand(0).getReg();
}
break;
}
return 0;
}
unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
default:
break;
case AArch64::STRWui:
case AArch64::STRXui:
case AArch64::STRBui:
case AArch64::STRHui:
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
return MI.getOperand(0).getReg();
}
break;
}
return 0;
}
/// Check all MachineMemOperands for a hint to suppress pairing.
bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOSuppressPair;
});
}
/// Set a flag on the first MachineMemOperand to suppress pairing.
void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
if (MI.memoperands_empty())
return;
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
}
/// Check all MachineMemOperands for a hint that the load/store is strided.
bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOStridedAccess;
});
}
bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default:
return false;
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
case AArch64::STURBBi:
case AArch64::STURHHi:
case AArch64::STURWi:
case AArch64::STURXi:
case AArch64::LDURSi:
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi:
case AArch64::LDURHHi:
case AArch64::LDURBBi:
case AArch64::LDURSBWi:
case AArch64::LDURSHWi:
return true;
}
}
Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default: return {};
case AArch64::PRFMui: return AArch64::PRFUMi;
case AArch64::LDRXui: return AArch64::LDURXi;
case AArch64::LDRWui: return AArch64::LDURWi;
case AArch64::LDRBui: return AArch64::LDURBi;
case AArch64::LDRHui: return AArch64::LDURHi;
case AArch64::LDRSui: return AArch64::LDURSi;
case AArch64::LDRDui: return AArch64::LDURDi;
case AArch64::LDRQui: return AArch64::LDURQi;
case AArch64::LDRBBui: return AArch64::LDURBBi;
case AArch64::LDRHHui: return AArch64::LDURHHi;
case AArch64::LDRSBXui: return AArch64::LDURSBXi;
case AArch64::LDRSBWui: return AArch64::LDURSBWi;
case AArch64::LDRSHXui: return AArch64::LDURSHXi;
case AArch64::LDRSHWui: return AArch64::LDURSHWi;
case AArch64::LDRSWui: return AArch64::LDURSWi;
case AArch64::STRXui: return AArch64::STURXi;
case AArch64::STRWui: return AArch64::STURWi;
case AArch64::STRBui: return AArch64::STURBi;
case AArch64::STRHui: return AArch64::STURHi;
case AArch64::STRSui: return AArch64::STURSi;
case AArch64::STRDui: return AArch64::STURDi;
case AArch64::STRQui: return AArch64::STURQi;
case AArch64::STRBBui: return AArch64::STURBBi;
case AArch64::STRHHui: return AArch64::STURHHi;
}
}
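// The scaled and unscaled forms address the same memory but encode the
// offset differently: the scaled immediate is multiplied by the access size,
// so, for example,
//   ldr  x0, [x1, #16]   // LDRXui, encoded immediate 2 (2 * 8 bytes)
//   ldur x0, [x1, #16]   // LDURXi, encoded immediate 16 (raw byte offset)
// refer to the same location.  getMemOpInfo below reports the corresponding
// Scale and the signed [-256, 255] immediate range of the unscaled forms.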
unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
switch (Opc) {
default:
return 2;
case AArch64::LDPXi:
case AArch64::LDPDi:
case AArch64::STPXi:
case AArch64::STPDi:
case AArch64::LDNPXi:
case AArch64::LDNPDi:
case AArch64::STNPXi:
case AArch64::STNPDi:
case AArch64::LDPQi:
case AArch64::STPQi:
case AArch64::LDNPQi:
case AArch64::STNPQi:
case AArch64::LDPWi:
case AArch64::LDPSi:
case AArch64::STPWi:
case AArch64::STPSi:
case AArch64::LDNPWi:
case AArch64::LDNPSi:
case AArch64::STNPWi:
case AArch64::STNPSi:
case AArch64::LDG:
case AArch64::STGPi:
return 3;
case AArch64::ADDG:
case AArch64::STGOffset:
return 2;
}
}
bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
// Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
case AArch64::STRXui:
case AArch64::STRWui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
case AArch64::LDRSWui:
// Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
case AArch64::STURWi:
case AArch64::STURXi:
case AArch64::LDURSi:
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi:
return true;
}
}
unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
bool &Is64Bit) {
switch (Opc) {
default:
llvm_unreachable("Opcode has no flag setting equivalent!");
// 32-bit cases:
case AArch64::ADDWri:
Is64Bit = false;
return AArch64::ADDSWri;
case AArch64::ADDWrr:
Is64Bit = false;
return AArch64::ADDSWrr;
case AArch64::ADDWrs:
Is64Bit = false;
return AArch64::ADDSWrs;
case AArch64::ADDWrx:
Is64Bit = false;
return AArch64::ADDSWrx;
case AArch64::ANDWri:
Is64Bit = false;
return AArch64::ANDSWri;
case AArch64::ANDWrr:
Is64Bit = false;
return AArch64::ANDSWrr;
case AArch64::ANDWrs:
Is64Bit = false;
return AArch64::ANDSWrs;
case AArch64::BICWrr:
Is64Bit = false;
return AArch64::BICSWrr;
case AArch64::BICWrs:
Is64Bit = false;
return AArch64::BICSWrs;
case AArch64::SUBWri:
Is64Bit = false;
return AArch64::SUBSWri;
case AArch64::SUBWrr:
Is64Bit = false;
return AArch64::SUBSWrr;
case AArch64::SUBWrs:
Is64Bit = false;
return AArch64::SUBSWrs;
case AArch64::SUBWrx:
Is64Bit = false;
return AArch64::SUBSWrx;
// 64-bit cases:
case AArch64::ADDXri:
Is64Bit = true;
return AArch64::ADDSXri;
case AArch64::ADDXrr:
Is64Bit = true;
return AArch64::ADDSXrr;
case AArch64::ADDXrs:
Is64Bit = true;
return AArch64::ADDSXrs;
case AArch64::ADDXrx:
Is64Bit = true;
return AArch64::ADDSXrx;
case AArch64::ANDXri:
Is64Bit = true;
return AArch64::ANDSXri;
case AArch64::ANDXrr:
Is64Bit = true;
return AArch64::ANDSXrr;
case AArch64::ANDXrs:
Is64Bit = true;
return AArch64::ANDSXrs;
case AArch64::BICXrr:
Is64Bit = true;
return AArch64::BICSXrr;
case AArch64::BICXrs:
Is64Bit = true;
return AArch64::BICSXrs;
case AArch64::SUBXri:
Is64Bit = true;
return AArch64::SUBSXri;
case AArch64::SUBXrr:
Is64Bit = true;
return AArch64::SUBSXrr;
case AArch64::SUBXrs:
Is64Bit = true;
return AArch64::SUBSXrs;
case AArch64::SUBXrx:
Is64Bit = true;
return AArch64::SUBSXrx;
}
}
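// Example: convertToFlagSettingOpc(AArch64::ADDWri, Is64Bit) returns
// AArch64::ADDSWri and sets Is64Bit to false, i.e. 'add w0, w1, #1' becomes
// the flag-setting 'adds w0, w1, #1'.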
// Is this a candidate for ld/st merging or pairing? For example, we don't
// touch volatiles or load/stores that have a hint to avoid pair formation.
bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
// If this is a volatile load/store, don't mess with it.
if (MI.hasOrderedMemoryRef())
return false;
// Make sure this is a reg/fi+imm (as opposed to an address reloc).
assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
"Expected a reg or frame index operand.");
if (!MI.getOperand(2).isImm())
return false;
// Can't merge/pair if the instruction modifies the base register.
// e.g., ldr x0, [x0]
// This case will never occur with an FI base.
if (MI.getOperand(1).isReg()) {
unsigned BaseReg = MI.getOperand(1).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (MI.modifiesRegister(BaseReg, TRI))
return false;
}
// Check if this load/store has a hint to avoid pair formation.
// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
if (isLdStPairSuppressed(MI))
+ return false;
+
+ // Do not pair any callee-save store/reload instructions in the
+ // prologue/epilogue if the CFI information encoded the operations as separate
+ // instructions, as that will cause the size of the actual prologue to mismatch
+ // with the prologue size recorded in the Windows CFI.
+ const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
+ bool NeedsWinCFI = MAI->usesWindowsCFI() &&
+ MI.getMF()->getFunction().needsUnwindTableEntry();
+ if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy)))
return false;
// On some CPUs quad load/store pairs are slower than two single load/stores.
if (Subtarget.isPaired128Slow()) {
switch (MI.getOpcode()) {
default:
break;
case AArch64::LDURQi:
case AArch64::STURQi:
case AArch64::LDRQui:
case AArch64::STRQui:
return false;
}
}
return true;
}
bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const {
unsigned Width;
return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
}
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
unsigned &Width, const TargetRegisterInfo *TRI) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
// Handle only loads/stores with base register followed by immediate offset.
if (LdSt.getNumExplicitOperands() == 3) {
// Non-paired instruction (e.g., ldr x1, [x0, #8]).
if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
!LdSt.getOperand(2).isImm())
return false;
} else if (LdSt.getNumExplicitOperands() == 4) {
// Paired instruction (e.g., ldp x1, x2, [x0, #8]).
if (!LdSt.getOperand(1).isReg() ||
(!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
!LdSt.getOperand(3).isImm())
return false;
} else
return false;
// Get the scaling factor for the instruction and set the width for the
// instruction.
unsigned Scale = 0;
int64_t Dummy1, Dummy2;
// If this returns false, then it's an instruction we don't want to handle.
if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
return false;
// Compute the offset. Offset is calculated as the immediate operand
// multiplied by the scaling factor. Unscaled instructions have scaling factor
// set to 1.
if (LdSt.getNumExplicitOperands() == 3) {
BaseOp = &LdSt.getOperand(1);
Offset = LdSt.getOperand(2).getImm() * Scale;
} else {
assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
BaseOp = &LdSt.getOperand(2);
Offset = LdSt.getOperand(3).getImm() * Scale;
}
assert((BaseOp->isReg() || BaseOp->isFI()) &&
"getMemOperandWithOffset only supports base "
"operands of type register or frame index.");
return true;
}
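// Worked example (illustrative): for 'ldr x0, [x1, #24]' (LDRXui, immediate
// 3, Scale 8) this reports BaseOp = x1, Offset = 24 and Width = 8; for the
// paired 'ldp x0, x1, [x2, #16]' (LDPXi, immediate 2, Scale 8) it reports
// BaseOp = x2 and Offset = 16.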
MachineOperand &
AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
return OfsOp;
}
bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
unsigned &Width, int64_t &MinOffset,
int64_t &MaxOffset) {
switch (Opcode) {
// Not a memory operation or something we want to handle.
default:
Scale = Width = 0;
MinOffset = MaxOffset = 0;
return false;
case AArch64::STRWpost:
case AArch64::LDRWpost:
Width = 32;
Scale = 4;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURQi:
case AArch64::STURQi:
Width = 16;
Scale = 1;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::PRFUMi:
case AArch64::LDURXi:
case AArch64::LDURDi:
case AArch64::STURXi:
case AArch64::STURDi:
Width = 8;
Scale = 1;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURWi:
case AArch64::LDURSi:
case AArch64::LDURSWi:
case AArch64::STURWi:
case AArch64::STURSi:
Width = 4;
Scale = 1;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURHi:
case AArch64::LDURHHi:
case AArch64::LDURSHXi:
case AArch64::LDURSHWi:
case AArch64::STURHi:
case AArch64::STURHHi:
Width = 2;
Scale = 1;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURBi:
case AArch64::LDURBBi:
case AArch64::LDURSBXi:
case AArch64::LDURSBWi:
case AArch64::STURBi:
case AArch64::STURBBi:
Width = 1;
Scale = 1;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDPQi:
case AArch64::LDNPQi:
case AArch64::STPQi:
case AArch64::STNPQi:
Scale = 16;
Width = 32;
MinOffset = -64;
MaxOffset = 63;
break;
case AArch64::LDRQui:
case AArch64::STRQui:
Scale = Width = 16;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::LDPXi:
case AArch64::LDPDi:
case AArch64::LDNPXi:
case AArch64::LDNPDi:
case AArch64::STPXi:
case AArch64::STPDi:
case AArch64::STNPXi:
case AArch64::STNPDi:
Scale = 8;
Width = 16;
MinOffset = -64;
MaxOffset = 63;
break;
case AArch64::PRFMui:
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
Scale = Width = 8;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::LDPWi:
case AArch64::LDPSi:
case AArch64::LDNPWi:
case AArch64::LDNPSi:
case AArch64::STPWi:
case AArch64::STPSi:
case AArch64::STNPWi:
case AArch64::STNPSi:
Scale = 4;
Width = 8;
MinOffset = -64;
MaxOffset = 63;
break;
case AArch64::LDRWui:
case AArch64::LDRSui:
case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
Scale = Width = 4;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::LDRHui:
case AArch64::LDRHHui:
case AArch64::LDRSHWui:
case AArch64::LDRSHXui:
case AArch64::STRHui:
case AArch64::STRHHui:
Scale = Width = 2;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::LDRBui:
case AArch64::LDRBBui:
case AArch64::LDRSBWui:
case AArch64::LDRSBXui:
case AArch64::STRBui:
case AArch64::STRBBui:
Scale = Width = 1;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::ADDG:
case AArch64::TAGPstack:
Scale = 16;
Width = 0;
MinOffset = 0;
MaxOffset = 63;
break;
case AArch64::LDG:
case AArch64::STGOffset:
case AArch64::STZGOffset:
Scale = Width = 16;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
Scale = 16;
Width = 32;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::STGPi:
Scale = Width = 16;
MinOffset = -64;
MaxOffset = 63;
break;
}
return true;
}
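// Note on units: for the load/store forms above, Width is the number of
// bytes accessed, Scale is the multiplier applied to the encoded immediate,
// and MinOffset/MaxOffset bound the immediate itself (in Scale-sized units),
// e.g. 0..4095 for the scaled forms, -256..255 for the unscaled forms and
// -64..63 for the pair forms.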
static unsigned getOffsetStride(unsigned Opc) {
switch (Opc) {
default:
return 0;
case AArch64::LDURQi:
case AArch64::STURQi:
return 16;
case AArch64::LDURXi:
case AArch64::LDURDi:
case AArch64::STURXi:
case AArch64::STURDi:
return 8;
case AArch64::LDURWi:
case AArch64::LDURSi:
case AArch64::LDURSWi:
case AArch64::STURWi:
case AArch64::STURSi:
return 4;
}
}
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
unsigned OffsetStride = getOffsetStride(Opc);
if (OffsetStride == 0)
return false;
// If the byte-offset isn't a multiple of the stride, we can't scale this
// offset.
if (Offset % OffsetStride != 0)
return false;
// Convert the byte offset used by the unscaled instructions into an
// "element" offset used by the scaled pair load/store instructions.
Offset /= OffsetStride;
return true;
}
// Unscale the scaled offsets. Returns false if the scaled offset can't be
// unscaled.
static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
unsigned OffsetStride = getOffsetStride(Opc);
if (OffsetStride == 0)
return false;
// Convert the "element" offset used by scaled pair load/store instructions
// into the byte-offset used by unscaled.
Offset *= OffsetStride;
return true;
}
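// Worked example: for STURXi (stride 8) a byte offset of 16 scales to the
// element offset 2, while a byte offset of 12 is rejected because it is not
// a multiple of the stride; unscaleOffset performs the inverse mapping.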
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
if (FirstOpc == SecondOpc)
return true;
// We can also pair sign-ext and zero-ext instructions.
switch (FirstOpc) {
default:
return false;
case AArch64::LDRWui:
case AArch64::LDURWi:
return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
}
// These instructions can't be paired based on their opcodes.
return false;
}
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
int64_t Offset1, unsigned Opcode1, int FI2,
int64_t Offset2, unsigned Opcode2) {
// Accesses through fixed stack object frame indices may access a different
// fixed stack slot. Check that the object offsets + offsets match.
if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
// Get the byte-offset from the object offset.
if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
return false;
ObjectOffset1 += Offset1;
ObjectOffset2 += Offset2;
// Get the "element" index in the object.
if (!scaleOffset(Opcode1, ObjectOffset1) ||
!scaleOffset(Opcode2, ObjectOffset2))
return false;
return ObjectOffset1 + 1 == ObjectOffset2;
}
return FI1 == FI2;
}
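// Example: two LDURXi loads, each with instruction offset 0, from fixed
// objects whose object offsets are 0 and 8 end up here as element offsets 0
// and 1, so they are reported as adjacent even though their frame indices
// differ.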
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
const MachineOperand &BaseOp2,
unsigned NumLoads) const {
const MachineInstr &FirstLdSt = *BaseOp1.getParent();
const MachineInstr &SecondLdSt = *BaseOp2.getParent();
if (BaseOp1.getType() != BaseOp2.getType())
return false;
assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
"Only base registers and frame indices are supported.");
// Check for both base regs and base FI.
if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
return false;
// Only cluster up to a single pair.
if (NumLoads > 1)
return false;
if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
return false;
// Can we pair these instructions based on their opcodes?
unsigned FirstOpc = FirstLdSt.getOpcode();
unsigned SecondOpc = SecondLdSt.getOpcode();
if (!canPairLdStOpc(FirstOpc, SecondOpc))
return false;
// Can't merge volatiles or load/stores that have a hint to avoid pair
// formation, for example.
if (!isCandidateToMergeOrPair(FirstLdSt) ||
!isCandidateToMergeOrPair(SecondLdSt))
return false;
// isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
return false;
int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
return false;
// Pairwise instructions have a 7-bit signed offset field.
if (Offset1 > 63 || Offset1 < -64)
return false;
// The caller should already have ordered First/SecondLdSt by offset.
// Note: except for non-equal frame index bases
if (BaseOp1.isFI()) {
assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
"Caller should have ordered offsets.");
const MachineFrameInfo &MFI =
FirstLdSt.getParent()->getParent()->getFrameInfo();
return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
BaseOp2.getIndex(), Offset2, SecondOpc);
}
assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
"Caller should have ordered offsets.");
return Offset1 + 1 == Offset2;
}
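// Illustrative example: with both base operands being x2,
//   ldr x0, [x2, #8]    // LDRXui, immediate 1
//   ldr x1, [x2, #16]   // LDRXui, immediate 2
// satisfy Offset1 + 1 == Offset2, so the pair is reported as clusterable and
// a later pass may turn it into an ldp.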
static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
unsigned Reg, unsigned SubIdx,
unsigned State,
const TargetRegisterInfo *TRI) {
if (!SubIdx)
return MIB.addReg(Reg, State);
if (TargetRegisterInfo::isPhysicalRegister(Reg))
return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
return MIB.addReg(Reg, State, SubIdx);
}
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
unsigned NumRegs) {
// We really want the positive remainder mod 32 here, which happens to be
// easily obtainable with a mask.
return ((DestReg - SrcReg) & 0x1f) < NumRegs;
}
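// E.g. copying Q0_Q1 forwards into Q1_Q2 would overwrite Q1 before it is
// read ((1 - 0) & 0x1f == 1, which is < 2), so copyPhysRegTuple below walks
// the sub-registers in reverse in that case.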
void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc,
unsigned Opcode,
ArrayRef<unsigned> Indices) const {
assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
const TargetRegisterInfo *TRI = &getRegisterInfo();
uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
unsigned NumRegs = Indices.size();
int SubReg = 0, End = NumRegs, Incr = 1;
if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
SubReg = NumRegs - 1;
End = -1;
Incr = -1;
}
for (; SubReg != End; SubReg += Incr) {
const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
}
}
void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
DebugLoc DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc,
unsigned Opcode, unsigned ZeroReg,
llvm::ArrayRef<unsigned> Indices) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
unsigned NumRegs = Indices.size();
#ifndef NDEBUG
uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
"GPR reg sequences should not be able to overlap");
#endif
for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
MIB.addReg(ZeroReg);
AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
MIB.addImm(0);
}
}
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
if (AArch64::GPR32spRegClass.contains(DestReg) &&
(AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
// If either operand is WSP, expand to ADD #0.
if (Subtarget.hasZeroCycleRegMove()) {
// Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
&AArch64::GPR64spRegClass);
unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
&AArch64::GPR64spRegClass);
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
.addReg(SrcRegX, RegState::Undef)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
}
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
if (Subtarget.hasZeroCycleRegMove()) {
// Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
&AArch64::GPR64spRegClass);
unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
&AArch64::GPR64spRegClass);
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
.addReg(AArch64::XZR)
.addReg(SrcRegX, RegState::Undef)
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
// Otherwise, expand to ORR WZR.
BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
.addReg(AArch64::WZR)
.addReg(SrcReg, getKillRegState(KillSrc));
}
}
return;
}
if (AArch64::GPR64spRegClass.contains(DestReg) &&
(AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
// If either operand is SP, expand to ADD #0.
BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
// Otherwise, expand to ORR XZR.
BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
.addReg(AArch64::XZR)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
// Copy a DDDD register quad by copying the individual sub-registers.
if (AArch64::DDDDRegClass.contains(DestReg) &&
AArch64::DDDDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
AArch64::dsub2, AArch64::dsub3};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
Indices);
return;
}
// Copy a DDD register triple by copying the individual sub-registers.
if (AArch64::DDDRegClass.contains(DestReg) &&
AArch64::DDDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
AArch64::dsub2};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
Indices);
return;
}
// Copy a DD register pair by copying the individual sub-registers.
if (AArch64::DDRegClass.contains(DestReg) &&
AArch64::DDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
Indices);
return;
}
// Copy a QQQQ register quad by copying the individual sub-registers.
if (AArch64::QQQQRegClass.contains(DestReg) &&
AArch64::QQQQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
Indices);
return;
}
// Copy a QQQ register triple by copying the individual sub-registers.
if (AArch64::QQQRegClass.contains(DestReg) &&
AArch64::QQQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
Indices);
return;
}
// Copy a QQ register pair by copying the individual sub-registers.
if (AArch64::QQRegClass.contains(DestReg) &&
AArch64::QQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
Indices);
return;
}
if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
AArch64::XZR, Indices);
return;
}
if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
AArch64::WZR, Indices);
return;
}
if (AArch64::FPR128RegClass.contains(DestReg) &&
AArch64::FPR128RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::STRQpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(AArch64::SP)
.addImm(-16);
BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(DestReg, RegState::Define)
.addReg(AArch64::SP)
.addImm(16);
}
return;
}
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
&AArch64::FPR128RegClass);
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
&AArch64::FPR128RegClass);
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
&AArch64::FPR128RegClass);
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR32RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
&AArch64::FPR32RegClass);
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
&AArch64::FPR128RegClass);
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR32RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
&AArch64::FPR32RegClass);
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
// Copies between GPR64 and FPR64.
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::GPR64RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (AArch64::GPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
// Copies between GPR32 and FPR32.
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::GPR32RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (AArch64::GPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (DestReg == AArch64::NZCV) {
assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
BuildMI(MBB, I, DL, get(AArch64::MSR))
.addImm(AArch64SysReg::NZCV)
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
return;
}
if (SrcReg == AArch64::NZCV) {
assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
.addImm(AArch64SysReg::NZCV)
.addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
return;
}
llvm_unreachable("unimplemented reg-to-reg copy");
}
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
unsigned SrcReg, bool IsKill,
unsigned SubIdx0, unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
unsigned SrcReg0 = SrcReg;
unsigned SrcReg1 = SrcReg;
if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
SubIdx0 = 0;
SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
SubIdx1 = 0;
}
BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
.addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
.addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO);
}
void AArch64InstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
bool isKill, int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
unsigned Opc = 0;
bool Offset = true;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::STRBui;
break;
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::STRHui;
break;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRWui;
if (TargetRegisterInfo::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
else
assert(SrcReg != AArch64::WSP);
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
Opc = AArch64::STRSui;
break;
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRXui;
if (TargetRegisterInfo::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
else
assert(SrcReg != AArch64::SP);
} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRDui;
} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::STPWi), SrcReg, isKill,
AArch64::sube32, AArch64::subo32, FI, MMO);
return;
}
break;
case 16:
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
Opc = AArch64::STRQui;
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov1d;
Offset = false;
} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::STPXi), SrcReg, isKill,
AArch64::sube64, AArch64::subo64, FI, MMO);
return;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Threev1d;
Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv1d;
Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d;
Offset = false;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Threev2d;
Offset = false;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d;
Offset = false;
}
break;
}
assert(Opc && "Unknown register class");
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI);
if (Offset)
MI.addImm(0);
MI.addMemOperand(MMO);
}
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
unsigned DestReg, unsigned SubIdx0,
unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
unsigned DestReg0 = DestReg;
unsigned DestReg1 = DestReg;
bool IsUndef = true;
if (TargetRegisterInfo::isPhysicalRegister(DestReg)) {
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
SubIdx0 = 0;
DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
SubIdx1 = 0;
IsUndef = false;
}
BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
.addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
.addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO);
}
void AArch64InstrInfo::loadRegFromStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
unsigned Opc = 0;
bool Offset = true;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRBui;
break;
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRHui;
break;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRWui;
if (TargetRegisterInfo::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
else
assert(DestReg != AArch64::WSP);
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRSui;
break;
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRXui;
if (TargetRegisterInfo::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
else
assert(DestReg != AArch64::SP);
} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRDui;
} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::LDPWi), DestReg, AArch64::sube32,
AArch64::subo32, FI, MMO);
return;
}
break;
case 16:
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRQui;
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov1d;
Offset = false;
} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::LDPXi), DestReg, AArch64::sube64,
AArch64::subo64, FI, MMO);
return;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Threev1d;
Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv1d;
Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d;
Offset = false;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Threev2d;
Offset = false;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d;
Offset = false;
}
break;
}
assert(Opc && "Unknown register class");
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
.addReg(DestReg, getDefRegState(true))
.addFrameIndex(FI);
if (Offset)
MI.addImm(0);
MI.addMemOperand(MMO);
}
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
unsigned DestReg, unsigned SrcReg, int Offset,
const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool SetNZCV,
bool NeedsWinCFI, bool *HasWinCFI) {
if (DestReg == SrcReg && Offset == 0)
return;
assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
"SP increment/decrement not 16-byte aligned");
bool isSub = Offset < 0;
if (isSub)
Offset = -Offset;
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
// scratch register. If DestReg is a virtual register, use it as the
// scratch register; otherwise, create a new virtual register (to be
// replaced by the scavenger at the end of PEI). That case can be optimized
// slightly if DestReg is SP which is always 16-byte aligned, so the scratch
// register can be loaded with offset%8 and the add/sub can use an extending
// instruction with LSL#3.
// Currently the function handles any offset but generates a poor code
// sequence.
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
unsigned Opc;
if (SetNZCV)
Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
else
Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
const unsigned MaxEncoding = 0xfff;
const unsigned ShiftSize = 12;
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
while (((unsigned)Offset) >= (1 << ShiftSize)) {
unsigned ThisVal;
if (((unsigned)Offset) > MaxEncodableValue) {
ThisVal = MaxEncodableValue;
} else {
ThisVal = Offset & MaxEncodableValue;
}
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
"Encoding cannot handle value that big");
BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
.addReg(SrcReg)
.addImm(ThisVal >> ShiftSize)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
.setMIFlag(Flag);
if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) {
if (HasWinCFI)
*HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
.addImm(ThisVal)
.setMIFlag(Flag);
}
SrcReg = DestReg;
Offset -= ThisVal;
if (Offset == 0)
return;
}
BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
.addReg(SrcReg)
.addImm(Offset)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
.setMIFlag(Flag);
if (NeedsWinCFI) {
if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
(SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
if (HasWinCFI)
*HasWinCFI = true;
if (Offset == 0)
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
setMIFlag(Flag);
else
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
addImm(Offset).setMIFlag(Flag);
} else if (DestReg == AArch64::SP) {
if (HasWinCFI)
*HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
addImm(Offset).setMIFlag(Flag);
}
}
}
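// Worked example: emitting 'add sp, sp, #0x12340' (Offset = 0x12340) first
// issues 'add sp, sp, #0x12, lsl #12' for the upper chunk and then
// 'add sp, sp, #0x340' for the remainder, since a single ADD/SUB immediate
// only holds 12 bits, optionally shifted left by 12.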
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
LiveIntervals *LIS, VirtRegMap *VRM) const {
// This is a bit of a hack. Consider this instruction:
//
// %0 = COPY %sp; GPR64all:%0
//
// We explicitly chose GPR64all for the virtual register so such a copy might
// be eliminated by RegisterCoalescer. However, that may not be possible, and
// %0 may even spill. We can't spill %sp, and since it is in the GPR64all
// register class, TargetInstrInfo::foldMemoryOperand() is going to try.
//
// To prevent that, we are going to constrain the %0 register class here.
//
// <rdar://problem/11522048>
//
if (MI.isFullCopy()) {
unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcReg = MI.getOperand(1).getReg();
if (SrcReg == AArch64::SP &&
TargetRegisterInfo::isVirtualRegister(DstReg)) {
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
return nullptr;
}
if (DstReg == AArch64::SP &&
TargetRegisterInfo::isVirtualRegister(SrcReg)) {
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
return nullptr;
}
}
// Handle the case where a copy is being spilled or filled but the source
// and destination register class don't match. For example:
//
// %0 = COPY %xzr; GPR64common:%0
//
// In this case we can still safely fold away the COPY and generate the
// following spill code:
//
// STRXui %xzr, %stack.0
//
// This also eliminates spilled cross register class COPYs (e.g. between x and
// d regs) of the same size. For example:
//
// %0 = COPY %1; GPR64:%0, FPR64:%1
//
// will be filled as
//
// LDRDui %0, fi<#0>
//
// instead of
//
// LDRXui %Temp, fi<#0>
// %0 = FMOV %Temp
//
if (MI.isCopy() && Ops.size() == 1 &&
// Make sure we're only folding the explicit COPY defs/uses.
(Ops[0] == 0 || Ops[0] == 1)) {
bool IsSpill = Ops[0] == 0;
bool IsFill = !IsSpill;
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock &MBB = *MI.getParent();
const MachineOperand &DstMO = MI.getOperand(0);
const MachineOperand &SrcMO = MI.getOperand(1);
unsigned DstReg = DstMO.getReg();
unsigned SrcReg = SrcMO.getReg();
// This is slightly expensive to compute for physical regs since
// getMinimalPhysRegClass is slow.
auto getRegClass = [&](unsigned Reg) {
return TargetRegisterInfo::isVirtualRegister(Reg)
? MRI.getRegClass(Reg)
: TRI.getMinimalPhysRegClass(Reg);
};
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
"Mismatched register size in non subreg COPY");
if (IsSpill)
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
getRegClass(SrcReg), &TRI);
else
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
getRegClass(DstReg), &TRI);
return &*--InsertPt;
}
// Handle cases like spilling def of:
//
// %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
//
// where the physical register source can be widened and stored to the full
// virtual reg destination stack slot, in this case producing:
//
// STRXui %xzr, %stack.0
//
if (IsSpill && DstMO.isUndef() &&
TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
assert(SrcMO.getSubReg() == 0 &&
"Unexpected subreg on physical register");
const TargetRegisterClass *SpillRC;
unsigned SpillSubreg;
switch (DstMO.getSubReg()) {
default:
SpillRC = nullptr;
break;
case AArch64::sub_32:
case AArch64::ssub:
if (AArch64::GPR32RegClass.contains(SrcReg)) {
SpillRC = &AArch64::GPR64RegClass;
SpillSubreg = AArch64::sub_32;
} else if (AArch64::FPR32RegClass.contains(SrcReg)) {
SpillRC = &AArch64::FPR64RegClass;
SpillSubreg = AArch64::ssub;
} else
SpillRC = nullptr;
break;
case AArch64::dsub:
if (AArch64::FPR64RegClass.contains(SrcReg)) {
SpillRC = &AArch64::FPR128RegClass;
SpillSubreg = AArch64::dsub;
} else
SpillRC = nullptr;
break;
}
if (SpillRC)
if (unsigned WidenedSrcReg =
TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
FrameIndex, SpillRC, &TRI);
return &*--InsertPt;
}
}
// Handle cases like filling use of:
//
// %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
//
// where we can load the full virtual reg source stack slot, into the subreg
// destination, in this case producing:
//
// LDRWui %0:sub_32<def,read-undef>, %stack.0
//
if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
const TargetRegisterClass *FillRC;
switch (DstMO.getSubReg()) {
default:
FillRC = nullptr;
break;
case AArch64::sub_32:
FillRC = &AArch64::GPR32RegClass;
break;
case AArch64::ssub:
FillRC = &AArch64::FPR32RegClass;
break;
case AArch64::dsub:
FillRC = &AArch64::FPR64RegClass;
break;
}
if (FillRC) {
assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
TRI.getRegSizeInBits(*FillRC) &&
"Mismatched regclass size on folded subreg COPY");
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
MachineInstr &LoadMI = *--InsertPt;
MachineOperand &LoadDst = LoadMI.getOperand(0);
assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
LoadDst.setSubReg(DstMO.getSubReg());
LoadDst.setIsUndef();
return &LoadMI;
}
}
}
// Cannot fold.
return nullptr;
}
int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
bool *OutUseUnscaledOp,
unsigned *OutUnscaledOp,
int *EmittableOffset) {
// Set output values in case of early exit.
if (EmittableOffset)
*EmittableOffset = 0;
if (OutUseUnscaledOp)
*OutUseUnscaledOp = false;
if (OutUnscaledOp)
*OutUnscaledOp = 0;
// Exit early for structured vector spills/fills as they can't take an
// immediate offset.
switch (MI.getOpcode()) {
default:
break;
case AArch64::LD1Twov2d:
case AArch64::LD1Threev2d:
case AArch64::LD1Fourv2d:
case AArch64::LD1Twov1d:
case AArch64::LD1Threev1d:
case AArch64::LD1Fourv1d:
case AArch64::ST1Twov2d:
case AArch64::ST1Threev2d:
case AArch64::ST1Fourv2d:
case AArch64::ST1Twov1d:
case AArch64::ST1Threev1d:
case AArch64::ST1Fourv1d:
case AArch64::IRG:
case AArch64::IRGstack:
return AArch64FrameOffsetCannotUpdate;
}
// Get the min/max offset and the scale.
unsigned Scale, Width;
int64_t MinOff, MaxOff;
if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff,
MaxOff))
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
// Construct the complete offset.
const MachineOperand &ImmOpnd =
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
Offset += ImmOpnd.getImm() * Scale;
// If the offset doesn't match the scale, we rewrite the instruction to
// use the unscaled instruction instead. Likewise, if we have a negative
// offset and there is an unscaled op to use.
Optional<unsigned> UnscaledOp =
AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
if (useUnscaledOp &&
!AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff))
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
int64_t Remainder = Offset % Scale;
assert(!(Remainder && useUnscaledOp) &&
"Cannot have remainder when using unscaled op");
assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
int NewOffset = Offset / Scale;
if (MinOff <= NewOffset && NewOffset <= MaxOff)
Offset = Remainder;
else {
NewOffset = NewOffset < 0 ? MinOff : MaxOff;
Offset = Offset - NewOffset * Scale + Remainder;
}
if (EmittableOffset)
*EmittableOffset = NewOffset;
if (OutUseUnscaledOp)
*OutUseUnscaledOp = useUnscaledOp;
if (OutUnscaledOp && UnscaledOp)
*OutUnscaledOp = *UnscaledOp;
return AArch64FrameOffsetCanUpdate |
(Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
}
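// Worked example (illustrative): for an LDRXui of a frame index with an
// incoming byte Offset of 20, the scaled form cannot encode it (20 % 8 != 0),
// so UseUnscaledOp is reported as true with LDURXi as the unscaled opcode,
// EmittableOffset becomes 20, the residual Offset becomes 0, and the result
// is AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.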
bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
const AArch64InstrInfo *TII) {
unsigned Opcode = MI.getOpcode();
unsigned ImmIdx = FrameRegIdx + 1;
if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
Offset += MI.getOperand(ImmIdx).getImm();
emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
MI.getOperand(0).getReg(), FrameReg, Offset, TII,
MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
MI.eraseFromParent();
Offset = 0;
return true;
}
int NewOffset;
unsigned UnscaledOp;
bool UseUnscaledOp;
int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
&UnscaledOp, &NewOffset);
if (Status & AArch64FrameOffsetCanUpdate) {
if (Status & AArch64FrameOffsetIsLegal)
// Replace the FrameIndex with FrameReg.
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
if (UseUnscaledOp)
MI.setDesc(TII->get(UnscaledOp));
MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
return Offset == 0;
}
return false;
}
void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
NopInst.setOpcode(AArch64::HINT);
NopInst.addOperand(MCOperand::createImm(0));
}
// AArch64 supports MachineCombiner.
bool AArch64InstrInfo::useMachineCombiner() const { return true; }
// True when Opc sets the NZCV flags
static bool isCombineInstrSettingFlag(unsigned Opc) {
switch (Opc) {
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSWrr:
case AArch64::SUBSXrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBSWri:
case AArch64::SUBSXri:
return true;
default:
break;
}
return false;
}
// 32b Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate32(unsigned Opc) {
switch (Opc) {
case AArch64::ADDWrr:
case AArch64::ADDWri:
case AArch64::SUBWrr:
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::SUBSWrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBWri:
case AArch64::SUBSWri:
return true;
default:
break;
}
return false;
}
// 64b Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate64(unsigned Opc) {
switch (Opc) {
case AArch64::ADDXrr:
case AArch64::ADDXri:
case AArch64::SUBXrr:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSXrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBXri:
case AArch64::SUBSXri:
return true;
default:
break;
}
return false;
}
// FP Opcodes that can be combined with a FMUL
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
switch (Inst.getOpcode()) {
default:
break;
case AArch64::FADDSrr:
case AArch64::FADDDrr:
case AArch64::FADDv2f32:
case AArch64::FADDv2f64:
case AArch64::FADDv4f32:
case AArch64::FSUBSrr:
case AArch64::FSUBDrr:
case AArch64::FSUBv2f32:
case AArch64::FSUBv2f64:
case AArch64::FSUBv4f32:
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
return (Options.UnsafeFPMath ||
Options.AllowFPOpFusion == FPOpFusion::Fast);
}
return false;
}
// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}
//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned CombineOpc, unsigned ZeroReg = 0,
bool CheckZeroReg = false) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineInstr *MI = nullptr;
if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
return false;
// Must only be used by the user we combine with.
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
return false;
if (CheckZeroReg) {
assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
// The third input reg must be zero.
if (MI->getOperand(3).getReg() != ZeroReg)
return false;
}
return true;
}
//
// Is \param MO defined by an integer multiply, and can it be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned MulOpc, unsigned ZeroReg) {
return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}
//
// Is \param MO defined by a floating-point multiply, and can it be combined?
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned MulOpc) {
return canCombine(MBB, MO, MulOpc);
}
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
// 3. Other forms of the same operation (intrinsics and other variants)
bool AArch64InstrInfo::isAssociativeAndCommutative(
const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
case AArch64::FADDDrr:
case AArch64::FADDSrr:
case AArch64::FADDv2f32:
case AArch64::FADDv2f64:
case AArch64::FADDv4f32:
case AArch64::FMULDrr:
case AArch64::FMULSrr:
case AArch64::FMULX32:
case AArch64::FMULX64:
case AArch64::FMULXv2f32:
case AArch64::FMULXv2f64:
case AArch64::FMULXv4f32:
case AArch64::FMULv2f32:
case AArch64::FMULv2f64:
case AArch64::FMULv4f32:
return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
default:
return false;
}
}
/// Find instructions that can be turned into madd.
static bool getMaddPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) {
unsigned Opc = Root.getOpcode();
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
if (!isCombineInstrCandidate(Opc))
return false;
if (isCombineInstrSettingFlag(Opc)) {
int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
// When NZCV is live, bail out.
if (Cmp_NZCV == -1)
return false;
unsigned NewOpc = convertToNonFlagSettingOpc(Root);
// When the opcode can't change, bail out.
// CHECKME: do we miss any cases for opcode conversion?
if (NewOpc == Opc)
return false;
Opc = NewOpc;
}
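// Look for a MUL feeding either source operand of the ADD/SUB; the OP1/OP2
// suffix in the chosen pattern records which operand of Root is the multiply.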
switch (Opc) {
default:
break;
case AArch64::ADDWrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"ADDWrr does not have register operands");
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
AArch64::WZR)) {
Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
Found = true;
}
break;
case AArch64::ADDXrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
AArch64::XZR)) {
Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
Found = true;
}
break;
case AArch64::SUBWrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
AArch64::WZR)) {
Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
Found = true;
}
break;
case AArch64::SUBXrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
AArch64::XZR)) {
Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
Found = true;
}
break;
case AArch64::ADDWri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
Found = true;
}
break;
case AArch64::ADDXri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
Found = true;
}
break;
case AArch64::SUBWri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
Found = true;
}
break;
case AArch64::SUBXri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
Found = true;
}
break;
}
return Found;
}
/// Floating-Point Support
/// Find instructions that can be turned into an fmadd/fmla.
static bool getFMAPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) {
if (!isCombineInstrCandidateFP(Root))
return false;
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
switch (Root.getOpcode()) {
default:
assert(false && "Unsupported FP instruction in combiner\n");
break;
case AArch64::FADDSrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"FADDSrr does not have register operands");
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv1i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv1i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
Found = true;
}
break;
case AArch64::FADDDrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv1i64_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv1i64_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
Found = true;
}
break;
case AArch64::FADDv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2f32)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2f32)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
Found = true;
}
break;
case AArch64::FADDv2f64:
if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2i64_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2f64)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2i64_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2f64)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
Found = true;
}
break;
case AArch64::FADDv4f32:
if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv4i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv4f32)) {
Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv4i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv4f32)) {
Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
Found = true;
}
break;
case AArch64::FSUBSrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv1i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
Found = true;
}
break;
case AArch64::FSUBDrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv1i64_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
Found = true;
}
break;
case AArch64::FSUBv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2f32)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2f32)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
Found = true;
}
break;
case AArch64::FSUBv2f64:
if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2i64_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2f64)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2i64_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2f64)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1);
Found = true;
}
break;
case AArch64::FSUBv4f32:
if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv4i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv4f32)) {
Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
Found = true;
}
if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv4i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1);
Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv4f32)) {
Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1);
Found = true;
}
break;
}
return Found;
}
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
bool AArch64InstrInfo::isThroughputPattern(
MachineCombinerPattern Pattern) const {
switch (Pattern) {
default:
break;
case MachineCombinerPattern::FMULADDS_OP1:
case MachineCombinerPattern::FMULADDS_OP2:
case MachineCombinerPattern::FMULSUBS_OP1:
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULADDD_OP1:
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
case MachineCombinerPattern::FNMULSUBS_OP1:
case MachineCombinerPattern::FNMULSUBD_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:
case MachineCombinerPattern::FMLAv2f32_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
case MachineCombinerPattern::FMLAv2f64_OP2:
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
case MachineCombinerPattern::FMLAv4f32_OP1:
case MachineCombinerPattern::FMLAv4f32_OP2:
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4f32_OP2:
return true;
} // end switch (Pattern)
return false;
}
/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order, since
/// the pattern evaluator stops checking as soon as it finds a faster sequence.
bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
// Integer patterns
if (getMaddPatterns(Root, Patterns))
return true;
// Floating point patterns
if (getFMAPatterns(Root, Patterns))
return true;
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
}
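// FMAInstKind describes the operand order expected by the fused opcode:
// Default is the scalar form (Rn, Rm, Ra), Indexed takes the accumulator
// first followed by the multiply operands and a lane immediate, and
// Accumulator takes the accumulator first with no lane immediate (see
// genFusedMultiply).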
enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
/// F|MUL I=A,B,0
/// F|ADD R,I,C
/// ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind the kind of fma instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
unsigned MaddOpc, const TargetRegisterClass *RC,
FMAInstKind kind = FMAInstKind::Default,
const unsigned *ReplacedAddend = nullptr) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
unsigned ResultReg = Root.getOperand(0).getReg();
unsigned SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
unsigned SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
unsigned SrcReg2;
bool Src2IsKill;
if (ReplacedAddend) {
// If we just generated a new addend, we must be its only use.
SrcReg2 = *ReplacedAddend;
Src2IsKill = true;
} else {
SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
}
if (TargetRegisterInfo::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
MRI.constrainRegClass(SrcReg2, RC);
MachineInstrBuilder MIB;
if (kind == FMAInstKind::Default)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
.addReg(SrcReg2, getKillRegState(Src2IsKill));
else if (kind == FMAInstKind::Indexed)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg2, getKillRegState(Src2IsKill))
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
.addImm(MUL->getOperand(3).getImm());
else if (kind == FMAInstKind::Accumulator)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg2, getKillRegState(Src2IsKill))
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill));
else
assert(false && "Invalid FMA instruction kind \n");
// Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
InsInstrs.push_back(MIB);
return MUL;
}
/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
/// MUL I=A,B,0
/// ADD R,I,Imm
/// ==> ORR V, ZR, Imm
/// ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs,
unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
const TargetRegisterClass *RC) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
unsigned ResultReg = Root.getOperand(0).getReg();
unsigned SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
unsigned SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
if (TargetRegisterInfo::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
if (TargetRegisterInfo::isVirtualRegister(VR))
MRI.constrainRegClass(VR, RC);
MachineInstrBuilder MIB =
BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
.addReg(VR);
// Insert the MADD
InsInstrs.push_back(MIB);
return MUL;
}
/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence
void AArch64InstrInfo::genAlternativeCodeSequence(
MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
MachineBasicBlock &MBB = *Root.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
MachineInstr *MUL;
const TargetRegisterClass *RC;
unsigned Opc;
switch (Pattern) {
default:
// Reassociate instructions.
TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
DelInstrs, InstrIdxForVirtReg);
return;
case MachineCombinerPattern::MULADDW_OP1:
case MachineCombinerPattern::MULADDX_OP1:
// MUL I=A,B,0
// ADD R,I,C
// ==> MADD R,A,B,C
// --- Create(MADD);
if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDW_OP2:
case MachineCombinerPattern::MULADDX_OP2:
// MUL I=A,B,0
// ADD R,C,I
// ==> MADD R,A,B,C
// --- Create(MADD);
if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDWI_OP1:
case MachineCombinerPattern::MULADDXI_OP1: {
// MUL I=A,B,0
// ADD R,I,Imm
// ==> ORR V, ZR, Imm
// ==> MADD R,A,B,V
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
OrrOpc = AArch64::ORRXri;
OrrRC = &AArch64::GPR64spRegClass;
BitSize = 64;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
unsigned NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
Imm = Imm << Val;
}
uint64_t UImm = SignExtend64(Imm, BitSize);
uint64_t Encoding;
if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
.addReg(ZeroReg)
.addImm(Encoding);
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
}
break;
}
case MachineCombinerPattern::MULSUBW_OP1:
case MachineCombinerPattern::MULSUBX_OP1: {
// MUL I=A,B,0
// SUB R,I, C
// ==> SUB V, 0, C
// ==> MADD R,A,B,V // = -C + A*B
// --- Create(MADD);
const TargetRegisterClass *SubRC;
unsigned SubOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
SubOpc = AArch64::SUBWrr;
SubRC = &AArch64::GPR32spRegClass;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
SubOpc = AArch64::SUBXrr;
SubRC = &AArch64::GPR64spRegClass;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
unsigned NewVR = MRI.createVirtualRegister(SubRC);
// SUB NewVR, 0, C
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
.addReg(ZeroReg)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
break;
}
case MachineCombinerPattern::MULSUBW_OP2:
case MachineCombinerPattern::MULSUBX_OP2:
// MUL I=A,B,0
// SUB R,C,I
// ==> MSUB R,A,B,C (computes C - A*B)
// --- Create(MSUB);
if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
Opc = AArch64::MSUBWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MSUBXrrr;
RC = &AArch64::GPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBWI_OP1:
case MachineCombinerPattern::MULSUBXI_OP1: {
// MUL I=A,B,0
// SUB R,I, Imm
// ==> ORR V, ZR, -Imm
// ==> MADD R,A,B,V // = -Imm + A*B
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
OrrOpc = AArch64::ORRXri;
OrrRC = &AArch64::GPR64spRegClass;
BitSize = 64;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
unsigned NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
Imm = Imm << Val;
}
uint64_t UImm = SignExtend64(-Imm, BitSize);
uint64_t Encoding;
if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
.addReg(ZeroReg)
.addImm(Encoding);
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
}
break;
}
// Floating Point Support
case MachineCombinerPattern::FMULADDS_OP1:
case MachineCombinerPattern::FMULADDD_OP1:
// FMUL I=A,B,0
// FADD R,I,C
// ==> FMADD R,A,B,C
// --- Create(FMADD);
if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
Opc = AArch64::FMADDSrrr;
RC = &AArch64::FPR32RegClass;
} else {
Opc = AArch64::FMADDDrrr;
RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::FMULADDS_OP2:
case MachineCombinerPattern::FMULADDD_OP2:
// FMUL I=A,B,0
// FADD R,C,I
// ==> FMADD R,A,B,C
// --- Create(FMADD);
if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
Opc = AArch64::FMADDSrrr;
RC = &AArch64::FPR32RegClass;
} else {
Opc = AArch64::FMADDDrrr;
RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
Opc = AArch64::FMLAv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
Opc = AArch64::FMLAv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
Opc = AArch64::FMLAv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
Opc = AArch64::FMLAv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2f32_OP1:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
case MachineCombinerPattern::FMLAv2f64_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4f32_OP1:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
case MachineCombinerPattern::FMLAv4f32_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMULSUBS_OP1:
case MachineCombinerPattern::FMULSUBD_OP1: {
// FMUL I=A,B,0
// FSUB R,I,C
// ==> FNMSUB R,A,B,C // = -C + A*B
// --- Create(FNMSUB);
if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
Opc = AArch64::FNMSUBSrrr;
RC = &AArch64::FPR32RegClass;
} else {
Opc = AArch64::FNMSUBDrrr;
RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
}
case MachineCombinerPattern::FNMULSUBS_OP1:
case MachineCombinerPattern::FNMULSUBD_OP1: {
// FNMUL I=A,B,0
// FSUB R,I,C
// ==> FNMADD R,A,B,C // = -A*B - C
// --- Create(FNMADD);
if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
Opc = AArch64::FNMADDSrrr;
RC = &AArch64::FPR32RegClass;
} else {
Opc = AArch64::FNMADDDrrr;
RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
}
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULSUBD_OP2: {
// FMUL I=A,B,0
// FSUB R,C,I
// ==> FMSUB R,A,B,C (computes C - A*B)
// --- Create(FMSUB);
if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
Opc = AArch64::FMSUBSrrr;
RC = &AArch64::FPR32RegClass;
} else {
Opc = AArch64::FMSUBDrrr;
RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
}
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
Opc = AArch64::FMLSv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
break;
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
Opc = AArch64::FMLSv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLSv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
Opc = AArch64::FMLSv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLSv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
case MachineCombinerPattern::FMLSv4f32_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
Opc = AArch64::FMLSv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);
} else {
Opc = AArch64::FMLSv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);
}
break;
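// The vector FMLS instruction computes accumulator - product (C - A*B). The
// *_OP1 patterns below need A*B - C, so negate C into a new register and use
// FMLA instead: A*B + (-C).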
case MachineCombinerPattern::FMLSv2f32_OP1:
case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
RC = &AArch64::FPR64RegClass;
unsigned NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
}
break;
}
case MachineCombinerPattern::FMLSv4f32_OP1:
case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
unsigned NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
}
break;
}
case MachineCombinerPattern::FMLSv2f64_OP1:
case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
unsigned NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
.add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
}
break;
}
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
DelInstrs.push_back(&Root);
}
/// Replace csincr-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbnz w9, #0, 0x44
/// \endcode
/// to
/// \code
/// b.<inverted condition code>
/// \endcode
///
/// 2. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbz w9, #0, 0x44
/// \endcode
/// to
/// \code
/// b.<condition code>
/// \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is power of 2.
///
/// Examples:
/// \code
/// and w8, w8, #0x400
/// cbnz w8, L1
/// \endcode
/// to
/// \code
/// tbnz w8, #10, L1
/// \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
bool IsNegativeBranch = false;
bool IsTestAndBranch = false;
unsigned TargetBBInMI = 0;
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unknown branch instruction?");
case AArch64::Bcc:
return false;
case AArch64::CBZW:
case AArch64::CBZX:
TargetBBInMI = 1;
break;
case AArch64::CBNZW:
case AArch64::CBNZX:
TargetBBInMI = 1;
IsNegativeBranch = true;
break;
case AArch64::TBZW:
case AArch64::TBZX:
TargetBBInMI = 2;
IsTestAndBranch = true;
break;
case AArch64::TBNZW:
case AArch64::TBNZX:
TargetBBInMI = 2;
IsNegativeBranch = true;
IsTestAndBranch = true;
break;
}
// So we increment a zero register and test for bits other
// than bit 0? Conservatively bail out in case the verifier
// missed this case.
if (IsTestAndBranch && MI.getOperand(1).getImm())
return false;
// Find Definition.
assert(MI.getParent() && "Incomplete machine instruction\n");
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
unsigned VReg = MI.getOperand(0).getReg();
if (!TargetRegisterInfo::isVirtualRegister(VReg))
return false;
MachineInstr *DefMI = MRI->getVRegDef(VReg);
// Look through COPY instructions to find definition.
while (DefMI->isCopy()) {
unsigned CopyVReg = DefMI->getOperand(1).getReg();
if (!MRI->hasOneNonDBGUse(CopyVReg))
return false;
if (!MRI->hasOneDef(CopyVReg))
return false;
DefMI = MRI->getVRegDef(CopyVReg);
}
switch (DefMI->getOpcode()) {
default:
return false;
// Fold AND into a TBZ/TBNZ if constant operand is power of 2.
case AArch64::ANDWri:
case AArch64::ANDXri: {
if (IsTestAndBranch)
return false;
if (DefMI->getParent() != MBB)
return false;
if (!MRI->hasOneNonDBGUse(VReg))
return false;
bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
if (!isPowerOf2_64(Mask))
return false;
MachineOperand &MO = DefMI->getOperand(1);
unsigned NewReg = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(NewReg))
return false;
assert(!MRI->def_empty(NewReg) && "Register must be defined.");
MachineBasicBlock &RefToMBB = *MBB;
MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
DebugLoc DL = MI.getDebugLoc();
unsigned Imm = Log2_64(Mask);
unsigned Opc = (Imm < 32)
? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
: (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
.addReg(NewReg)
.addImm(Imm)
.addMBB(TBB);
// Register lives on to the TBZ/TBNZ now.
MO.setIsKill(false);
// For immediates smaller than 32, we must use the 32-bit (W)
// variant in all cases, since the 64-bit variant cannot encode them.
// Therefore, if the input register is 64-bit, take its 32-bit
// sub-register.
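// e.g. (illustrative): and x8, x8, #0x8 ; cbnz x8, L1 ==> tbnz w8, #3, L1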
if (!Is32Bit && Imm < 32)
NewMI->getOperand(0).setSubReg(AArch64::sub_32);
MI.eraseFromParent();
return true;
}
// Look for CSINC
case AArch64::CSINCWr:
case AArch64::CSINCXr: {
if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
DefMI->getOperand(2).getReg() == AArch64::WZR) &&
!(DefMI->getOperand(1).getReg() == AArch64::XZR &&
DefMI->getOperand(2).getReg() == AArch64::XZR))
return false;
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
return false;
AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
// Convert only when the condition code is not modified between
// the CSINC and the branch. The CC may be used by other
// instructions in between.
if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
return false;
MachineBasicBlock &RefToMBB = *MBB;
MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
DebugLoc DL = MI.getDebugLoc();
if (IsNegativeBranch)
CC = AArch64CC::getInvertedCondCode(CC);
BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
MI.eraseFromParent();
return true;
}
}
}
std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
const unsigned Mask = AArch64II::MO_FRAGMENT;
return std::make_pair(TF & Mask, TF & ~Mask);
}
ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
using namespace AArch64II;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
{MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
{MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
{MO_HI12, "aarch64-hi12"}};
return makeArrayRef(TargetFlags);
}
ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
using namespace AArch64II;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_COFFSTUB, "aarch64-coffstub"},
{MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"},
{MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"},
{MO_DLLIMPORT, "aarch64-dllimport"}};
return makeArrayRef(TargetFlags);
}
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
{{MOSuppressPair, "aarch64-suppress-pair"},
{MOStridedAccess, "aarch64-strided-access"}};
return makeArrayRef(TargetFlags);
}
/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1 Save LR OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION I1
/// I3 Restore LR I2
/// I3
/// RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1 OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION I1
/// RET I2
/// RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1 OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION I1
/// I3 I2
/// I3
/// RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1 OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION I1
/// BL f I2
/// B f
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1 Save LR OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION I1
/// I3 Restore LR I2
/// I3
/// RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
MachineOutlinerDefault, /// Emit a save, restore, call, and return.
MachineOutlinerTailCall, /// Only emit a branch.
MachineOutlinerNoLRSave, /// Emit a call and return.
MachineOutlinerThunk, /// Emit a call and tail-call.
MachineOutlinerRegSave /// Same as default, but save to a register.
};
enum MachineOutlinerMBBFlags {
LRUnavailableSomewhere = 0x2,
HasCalls = 0x4,
UnsafeRegsDead = 0x8
};
unsigned
AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
assert(C.LRUWasSet && "LRU wasn't set?");
MachineFunction *MF = C.getMF();
const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
MF->getSubtarget().getRegisterInfo());
// Check if there is an available register across the sequence that we can
// use.
for (unsigned Reg : AArch64::GPR64RegClass) {
if (!ARI->isReservedReg(*MF, Reg) &&
Reg != AArch64::LR && // LR is not reserved, but don't use it.
Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
Reg != AArch64::X17 && // Ditto for X17.
C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
return Reg;
}
// No suitable register. Return 0.
return 0u;
}
outliner::OutlinedFunction
AArch64InstrInfo::getOutliningCandidateInfo(
std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
unsigned SequenceSize =
std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
[this](unsigned Sum, const MachineInstr &MI) {
return Sum + getInstSizeInBytes(MI);
});
// Properties about candidate MBBs that hold for all of them.
unsigned FlagsSetInAll = 0xF;
// Compute liveness information for each candidate, and set FlagsSetInAll.
const TargetRegisterInfo &TRI = getRegisterInfo();
std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
[&FlagsSetInAll](outliner::Candidate &C) {
FlagsSetInAll &= C.Flags;
});
// According to the AArch64 Procedure Call Standard, the following are
// undefined on entry/exit from a function call:
//
// * Registers x16, x17, (and thus w16, w17)
// * Condition codes (and thus the NZCV register)
//
// Because of this, we can't outline any sequence of instructions where
// one of these registers is live into/across it. Thus, we need to
// delete those candidates.
auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
// If the unsafe registers in this block are all dead, then we don't need
// to compute liveness here.
if (C.Flags & UnsafeRegsDead)
return false;
C.initLRU(TRI);
LiveRegUnits LRU = C.LRU;
return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
!LRU.available(AArch64::NZCV));
};
// Are there any candidates where those registers are live?
if (!(FlagsSetInAll & UnsafeRegsDead)) {
// Erase every candidate that violates the restrictions above. (It could be
// true that we have viable candidates, so it's not worth bailing out in
// the case that, say, 1 out of 20 candidates violates the restrictions.)
RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
RepeatedSequenceLocs.end(),
CantGuaranteeValueAcrossCall),
RepeatedSequenceLocs.end());
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
return outliner::OutlinedFunction();
}
// At this point, we have only "safe" candidates to outline. Figure out
// frame + call instruction information.
unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
// Helper lambda which sets call information for every candidate.
auto SetCandidateCallInfo =
[&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
for (outliner::Candidate &C : RepeatedSequenceLocs)
C.setCallInfo(CallID, NumBytesForCall);
};
unsigned FrameID = MachineOutlinerDefault;
unsigned NumBytesToCreateFrame = 4;
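// By default the outlined frame is just a RET (4 bytes).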
bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
});
// Returns true if an instruction is safe to fix up, false otherwise.
auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
if (MI.isCall())
return true;
if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
!MI.readsRegister(AArch64::SP, &TRI))
return true;
// Any modification of SP will break our code to save/restore LR.
// FIXME: We could handle some instructions which add a constant
// offset to SP, with a bit more work.
if (MI.modifiesRegister(AArch64::SP, &TRI))
return false;
// At this point, we have a stack instruction that we might need to
// fix up. We'll handle it if it's a load or store.
if (MI.mayLoadOrStore()) {
const MachineOperand *Base; // Filled with the base operand of MI.
int64_t Offset; // Filled with the offset of MI.
// Does it allow us to offset the base operand and is the base the
// register SP?
if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
Base->getReg() != AArch64::SP)
return false;
// Find the minimum/maximum offset for this instruction and check
// if fixing it up would be in range.
int64_t MinOffset,
MaxOffset; // Unscaled offsets for the instruction.
unsigned Scale; // The scale to multiply the offsets by.
unsigned DummyWidth;
getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
Offset += 16; // Update the offset to what it would be if we outlined.
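// (Saving LR in the outlined frame moves SP by 16 bytes, which is where the
// fixed adjustment comes from; see fixupPostOutline.)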
if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
return false;
// It's in range, so we can outline it.
return true;
}
// FIXME: Add handling for instructions like "add x0, sp, #8".
// We can't fix it up, so don't outline it.
return false;
};
// True if it's possible to fix up each stack instruction in this sequence.
// Important for frames/call variants that modify the stack.
bool AllStackInstrsSafe = std::all_of(
FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
// If the last instruction in any candidate is a terminator, then we should
// tail call all of the candidates.
if (RepeatedSequenceLocs[0].back()->isTerminator()) {
FrameID = MachineOutlinerTailCall;
NumBytesToCreateFrame = 0;
SetCandidateCallInfo(MachineOutlinerTailCall, 4);
}
else if (LastInstrOpcode == AArch64::BL ||
(LastInstrOpcode == AArch64::BLR && !HasBTI)) {
// FIXME: Do we need to check if the code after this uses the value of LR?
FrameID = MachineOutlinerThunk;
NumBytesToCreateFrame = 0;
SetCandidateCallInfo(MachineOutlinerThunk, 4);
}
else {
// We need to decide how to emit calls + frames. We can always emit the same
// frame if we don't need to save to the stack. If we have to save to the
// stack, then we need a different frame.
unsigned NumBytesNoStackCalls = 0;
std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
for (outliner::Candidate &C : RepeatedSequenceLocs) {
C.initLRU(TRI);
// Is LR available? If so, we don't need a save.
if (C.LRU.available(AArch64::LR)) {
NumBytesNoStackCalls += 4;
C.setCallInfo(MachineOutlinerNoLRSave, 4);
CandidatesWithoutStackFixups.push_back(C);
}
// Is an unused register available? If so, we won't modify the stack, so
// we can outline with the same frame type as those that don't save LR.
else if (findRegisterToSaveLRTo(C)) {
NumBytesNoStackCalls += 12;
C.setCallInfo(MachineOutlinerRegSave, 12);
CandidatesWithoutStackFixups.push_back(C);
}
// Is SP used in the sequence at all? If not, we don't have to modify
// the stack, so we are guaranteed to get the same frame.
else if (C.UsedInSequence.available(AArch64::SP)) {
NumBytesNoStackCalls += 12;
C.setCallInfo(MachineOutlinerDefault, 12);
CandidatesWithoutStackFixups.push_back(C);
}
// If we outline this, we need to modify the stack. Pretend we don't
// outline this by saving all of its bytes.
else {
NumBytesNoStackCalls += SequenceSize;
}
}
// If there are no places where we have to save LR, then note that we
// don't have to update the stack. Otherwise, give every candidate the
// default call type, as long as it's safe to do so.
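// (12 bytes = save + BL + restore, the per-candidate cost of a default call.)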
if (!AllStackInstrsSafe ||
NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
RepeatedSequenceLocs = CandidatesWithoutStackFixups;
FrameID = MachineOutlinerNoLRSave;
} else {
SetCandidateCallInfo(MachineOutlinerDefault, 12);
}
// If we dropped all of the candidates, bail out here.
if (RepeatedSequenceLocs.size() < 2) {
RepeatedSequenceLocs.clear();
return outliner::OutlinedFunction();
}
}
// Does every candidate's MBB contain a call? If so, then we might have a call
// in the range.
if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
// Check if the range contains a call. These require a save + restore of the
// link register.
bool ModStackToSaveLR = false;
if (std::any_of(FirstCand.front(), FirstCand.back(),
[](const MachineInstr &MI) { return MI.isCall(); }))
ModStackToSaveLR = true;
// Handle the last instruction separately. If this is a tail call, then the
// last instruction is a call. We don't want to save + restore in this case.
// However, it could be possible that the last instruction is a call without
// it being valid to tail call this sequence. We should consider this as
// well.
else if (FrameID != MachineOutlinerThunk &&
FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
ModStackToSaveLR = true;
if (ModStackToSaveLR) {
// We can't fix up the stack. Bail out.
if (!AllStackInstrsSafe) {
RepeatedSequenceLocs.clear();
return outliner::OutlinedFunction();
}
// Save + restore LR.
NumBytesToCreateFrame += 8;
}
}
return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
NumBytesToCreateFrame, FrameID);
}
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
const Function &F = MF.getFunction();
// Can F be deduplicated by the linker? If it can, don't outline from it.
if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
return false;
// Don't outline from functions with section markings; the program could
// expect that all the code is in the named section.
// FIXME: Allow outlining from multiple functions with the same section
// marking.
if (F.hasSection())
return false;
// Outlining from functions with redzones is unsafe since the outliner may
// modify the stack. Check if hasRedZone is true or unknown; if yes, don't
// outline from it.
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (!AFI || AFI->hasRedZone().getValueOr(true))
return false;
// It's safe to outline from MF.
return true;
}
bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
unsigned &Flags) const {
// Check if LR is available through all of the MBB. If it's not, then set
// a flag.
assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
"Suitable Machine Function for outlining must track liveness");
LiveRegUnits LRU(getRegisterInfo());
std::for_each(MBB.rbegin(), MBB.rend(),
[&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
// Check if each of the unsafe registers is available...
bool W16AvailableInBlock = LRU.available(AArch64::W16);
bool W17AvailableInBlock = LRU.available(AArch64::W17);
bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
// If all of these are dead (and not live out), we know we don't have to check
// them later.
if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
// Now, add the live outs to the set.
LRU.addLiveOuts(MBB);
// If any of these registers is available in the MBB, but also a live out of
// the block, then we know outlining is unsafe.
if (W16AvailableInBlock && !LRU.available(AArch64::W16))
return false;
if (W17AvailableInBlock && !LRU.available(AArch64::W17))
return false;
if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
return false;
// Check if there's a call inside this MachineBasicBlock. If there is, then
// set a flag.
if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
Flags |= MachineOutlinerMBBFlags::HasCalls;
MachineFunction *MF = MBB.getParent();
// In the event that we outline, we may have to save LR. If there is an
// available register in the MBB, then we'll always save LR there. Check if
// this is true.
bool CanSaveLR = false;
const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
MF->getSubtarget().getRegisterInfo());
// Check if there is an available register across the sequence that we can
// use.
for (unsigned Reg : AArch64::GPR64RegClass) {
if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
CanSaveLR = true;
break;
}
}
// Check if we have a register we can save LR to, and if LR was used
// somewhere. If both of those things are true, then we need to evaluate the
// safety of outlining stack instructions later.
if (!CanSaveLR && !LRU.available(AArch64::LR))
Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
return true;
}
outliner::InstrType
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
unsigned Flags) const {
MachineInstr &MI = *MIT;
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
// Don't outline LOHs.
if (FuncInfo->getLOHRelated().count(&MI))
return outliner::InstrType::Illegal;
// Don't allow debug values to impact outlining type.
if (MI.isDebugInstr() || MI.isIndirectDebugValue())
return outliner::InstrType::Invisible;
// At this point, KILL instructions don't really tell us much so we can go
// ahead and skip over them.
if (MI.isKill())
return outliner::InstrType::Invisible;
// Is this a terminator for a basic block?
if (MI.isTerminator()) {
// Is this the end of a function?
if (MI.getParent()->succ_empty())
return outliner::InstrType::Legal;
// It's not, so don't outline it.
return outliner::InstrType::Illegal;
}
// Make sure none of the operands are un-outlinable.
for (const MachineOperand &MOP : MI.operands()) {
if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
MOP.isTargetIndex())
return outliner::InstrType::Illegal;
// If it uses LR or W30 explicitly, then don't touch it.
if (MOP.isReg() && !MOP.isImplicit() &&
(MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
return outliner::InstrType::Illegal;
}
// Special cases for instructions that can always be outlined, but will fail
// the later tests, e.g. ADRPs, which are PC-relative and may use LR, but can
// always be outlined because they don't require a *specific* value to be in LR.
if (MI.getOpcode() == AArch64::ADRP)
return outliner::InstrType::Legal;
// If MI is a call we might be able to outline it. We don't want to outline
// any calls that rely on the position of items on the stack. When we outline
// something containing a call, we have to emit a save and restore of LR in
// the outlined function. Currently, this always happens by saving LR to the
// stack. Thus, if we outline, say, half the parameters for a function call
// plus the call, then we'll break the callee's expectations for the layout
// of the stack.
//
// FIXME: Allow calls to functions which construct a stack frame, as long
// as they don't access arguments on the stack.
// FIXME: Figure out some way to analyze functions defined in other modules.
// We should be able to compute the memory usage based on the IR calling
// convention, even if we can't see the definition.
if (MI.isCall()) {
// Get the function associated with the call. Look at each operand and find
// the one that represents the callee and get its name.
const Function *Callee = nullptr;
for (const MachineOperand &MOP : MI.operands()) {
if (MOP.isGlobal()) {
Callee = dyn_cast<Function>(MOP.getGlobal());
break;
}
}
// Never outline calls to mcount. There isn't any rule that would require
// this, but the Linux kernel's "ftrace" feature depends on it.
if (Callee && Callee->getName() == "\01_mcount")
return outliner::InstrType::Illegal;
// If we don't know anything about the callee, assume it depends on the
// stack layout of the caller. In that case, it's only legal to outline
// as a tail-call. Whitelist the call instructions we know about so we
// don't get unexpected results with call pseudo-instructions.
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
if (!Callee)
return UnknownCallOutlineType;
// We have a function we have information about. Check if it's something we
// can safely outline.
MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
// We don't know what's going on with the callee at all. Don't touch it.
if (!CalleeMF)
return UnknownCallOutlineType;
// Check if we know anything about the callee saves on the function. If we
// don't, then don't touch it, since that implies that we haven't
// computed anything about its stack frame yet.
MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
MFI.getNumObjects() > 0)
return UnknownCallOutlineType;
// At this point, we can say that CalleeMF ought to not pass anything on the
// stack. Therefore, we can outline it.
return outliner::InstrType::Legal;
}
// Don't outline positions.
if (MI.isPosition())
return outliner::InstrType::Illegal;
// Don't touch the link register or W30.
if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
return outliner::InstrType::Illegal;
// Don't outline BTI instructions, because that will prevent the outlining
// site from being indirectly callable.
if (MI.getOpcode() == AArch64::HINT) {
int64_t Imm = MI.getOperand(0).getImm();
if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
return outliner::InstrType::Illegal;
}
return outliner::InstrType::Legal;
}
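// A rough sketch of how the checks above classify some common instructions
// (illustrative only; the function and symbol names below are placeholders):
//
//   adrp x8, sym             -> Legal (ADRP special case above)
//   bl   known_leaf_fn       -> Legal, when the callee's MachineFunction has
//                               valid callee-save info and an empty frame
//   bl   unknown_fn          -> LegalTerminator (outlinable only as tail call)
//   hint #34  ("bti c")      -> Illegal, keeping outlined code BTI-compatible
//   terminators              -> Legal only when the block has no successors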
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
for (MachineInstr &MI : MBB) {
const MachineOperand *Base;
unsigned Width;
int64_t Offset;
// Is this a load or store with an immediate offset with SP as the base?
if (!MI.mayLoadOrStore() ||
!getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
(Base->isReg() && Base->getReg() != AArch64::SP))
continue;
// It is, so we have to fix it up.
unsigned Scale;
int64_t Dummy1, Dummy2;
MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
assert(Scale != 0 && "Unexpected opcode!");
// We've pushed the return address to the stack, so add 16 to the offset.
// This is safe, since we already checked if it would overflow when we
// checked if this instruction was legal to outline.
int64_t NewImm = (Offset + 16) / Scale;
StackOffsetOperand.setImm(NewImm);
}
}
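// Worked example of the fixup above (operand values chosen for illustration):
// an "ldr x0, [sp, #32]" in the outlined range is reported with Offset = 32
// bytes and Scale = 8. After the outlined function saves LR with
// "str x30, [sp, #-16]!", that slot is 16 bytes further from SP, so the new
// scaled immediate becomes (32 + 16) / 8 = 6, i.e. "ldr x0, [sp, #48]".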
void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
const outliner::OutlinedFunction &OF) const {
// For thunk outlining, rewrite the last instruction from a call to a
// tail-call.
if (OF.FrameConstructionID == MachineOutlinerThunk) {
MachineInstr *Call = &*--MBB.instr_end();
unsigned TailOpcode;
if (Call->getOpcode() == AArch64::BL) {
TailOpcode = AArch64::TCRETURNdi;
} else {
assert(Call->getOpcode() == AArch64::BLR);
TailOpcode = AArch64::TCRETURNriALL;
}
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
.add(Call->getOperand(0))
.addImm(0);
MBB.insert(MBB.end(), TC);
Call->eraseFromParent();
}
// Is there a call in the outlined range?
auto IsNonTailCall = [](MachineInstr &MI) {
return MI.isCall() && !MI.isReturn();
};
if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
// Fix up the instructions in the range, since we're going to modify the
// stack.
assert(OF.FrameConstructionID != MachineOutlinerDefault &&
"Can only fix up stack references once");
fixupPostOutline(MBB);
// LR has to be a live in so that we can save it.
MBB.addLiveIn(AArch64::LR);
MachineBasicBlock::iterator It = MBB.begin();
MachineBasicBlock::iterator Et = MBB.end();
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
OF.FrameConstructionID == MachineOutlinerThunk)
Et = std::prev(MBB.end());
// Insert a save before the outlined region
MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR)
.addReg(AArch64::SP)
.addImm(-16);
It = MBB.insert(It, STRXpre);
const TargetSubtargetInfo &STI = MF.getSubtarget();
const MCRegisterInfo *MRI = STI.getRegisterInfo();
unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
// Add a CFI saying the stack was moved 16 B down.
int64_t StackPosEntry =
MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(StackPosEntry)
.setMIFlags(MachineInstr::FrameSetup);
// Add a CFI saying that the LR that we want to find is now 16 B higher than
// before.
int64_t LRPosEntry =
MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(LRPosEntry)
.setMIFlags(MachineInstr::FrameSetup);
// Insert a restore before the terminator for the function.
MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
.addReg(AArch64::SP)
.addImm(16);
Et = MBB.insert(Et, LDRXpost);
}
// If this is a tail call outlined function, then there's already a return.
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
OF.FrameConstructionID == MachineOutlinerThunk)
return;
// It's not a tail call, so we have to insert the return ourselves.
MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
.addReg(AArch64::LR, RegState::Undef);
MBB.insert(MBB.end(), ret);
// Did we have to modify the stack by saving the link register?
if (OF.FrameConstructionID != MachineOutlinerDefault)
return;
// We modified the stack.
// Walk over the basic block and fix up all the stack accesses.
fixupPostOutline(MBB);
}
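// When the outlined body itself contains a non-tail call, the code above wraps
// it in an LR spill and reload; schematically (a simplified sketch):
//
//   str x30, [sp, #-16]!     // STRXpre built above
//   .cfi_def_cfa_offset 16
//   .cfi_offset w30, 16
//   ...outlined instructions, SP-relative offsets already fixed up...
//   ldr x30, [sp], #16       // LDRXpost built above
//   ret                      // appended unless this is a tail call or thunk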
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
MachineFunction &MF, const outliner::Candidate &C) const {
// Are we tail calling?
if (C.CallConstructionID == MachineOutlinerTailCall) {
// If yes, then we can just branch to the label.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
.addGlobalAddress(M.getNamedValue(MF.getName()))
.addImm(0));
return It;
}
// Are we saving the link register?
if (C.CallConstructionID == MachineOutlinerNoLRSave ||
C.CallConstructionID == MachineOutlinerThunk) {
// No, so just insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
.addGlobalAddress(M.getNamedValue(MF.getName())));
return It;
}
// We want to return the spot where we inserted the call.
MachineBasicBlock::iterator CallPt;
// Instructions for saving and restoring LR around the call instruction we're
// going to insert.
MachineInstr *Save;
MachineInstr *Restore;
// Can we save to a register?
if (C.CallConstructionID == MachineOutlinerRegSave) {
// FIXME: This logic should be sunk into a target-specific interface so that
// we don't have to recompute the register.
unsigned Reg = findRegisterToSaveLRTo(C);
assert(Reg != 0 && "No callee-saved register available?");
// Save and restore LR from that register.
Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
.addReg(AArch64::XZR)
.addReg(AArch64::LR)
.addImm(0);
Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
.addReg(AArch64::XZR)
.addReg(Reg)
.addImm(0);
} else {
// We have the default case. Save and restore from SP.
Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR)
.addReg(AArch64::SP)
.addImm(-16);
Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
.addReg(AArch64::SP)
.addImm(16);
}
It = MBB.insert(It, Save);
It++;
// Insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
.addGlobalAddress(M.getNamedValue(MF.getName())));
CallPt = It;
It++;
It = MBB.insert(It, Restore);
return CallPt;
}
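// Call-site shapes produced above, sketched for an outlined function named
// OUTLINED_FUNCTION_0 (the name is illustrative):
//
//   MachineOutlinerTailCall:  b   OUTLINED_FUNCTION_0          (TCRETURNdi)
//   MachineOutlinerNoLRSave,
//   MachineOutlinerThunk:     bl  OUTLINED_FUNCTION_0
//   MachineOutlinerRegSave:   mov x<reg>, x30                  (ORRXrs)
//                             bl  OUTLINED_FUNCTION_0
//                             mov x30, x<reg>
//   MachineOutlinerDefault:   str x30, [sp, #-16]!
//                             bl  OUTLINED_FUNCTION_0
//                             ldr x30, [sp], #16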
bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
MachineFunction &MF) const {
return MF.getFunction().hasMinSize();
}
bool AArch64InstrInfo::isCopyInstrImpl(
const MachineInstr &MI, const MachineOperand *&Source,
const MachineOperand *&Destination) const {
// AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR as a source register and a
// zero shift immediate are used as aliases for the mov instruction.
if (MI.getOpcode() == AArch64::ORRWrs &&
MI.getOperand(1).getReg() == AArch64::WZR &&
MI.getOperand(3).getImm() == 0x0) {
Destination = &MI.getOperand(0);
Source = &MI.getOperand(2);
return true;
}
if (MI.getOpcode() == AArch64::ORRXrs &&
MI.getOperand(1).getReg() == AArch64::XZR &&
MI.getOperand(3).getImm() == 0x0) {
Destination = &MI.getOperand(0);
Source = &MI.getOperand(2);
return true;
}
return false;
}
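// For example, "orr w0, wzr, w1" (the canonical mov alias, shift amount 0) is
// recognised here with Destination = w0 and Source = w1; a non-zero shift
// amount or a first source other than WZR/XZR is not treated as a copy.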
#define GET_INSTRINFO_HELPERS
#include "AArch64GenInstrInfo.inc"
Index: projects/clang900-import/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp (revision 351722)
@@ -1,8467 +1,8484 @@
//===-- MipsAsmParser.cpp - Parse Mips assembly to MCInst instructions ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/MipsABIFlagsSection.h"
#include "MCTargetDesc/MipsABIInfo.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "MipsTargetStreamer.h"
#include "TargetInfo/MipsTargetInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCAsmParserUtils.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
using namespace llvm;
#define DEBUG_TYPE "mips-asm-parser"
namespace llvm {
class MCInstrInfo;
} // end namespace llvm
extern cl::opt<bool> EmitJalrReloc;
namespace {
class MipsAssemblerOptions {
public:
MipsAssemblerOptions(const FeatureBitset &Features_) : Features(Features_) {}
MipsAssemblerOptions(const MipsAssemblerOptions *Opts) {
ATReg = Opts->getATRegIndex();
Reorder = Opts->isReorder();
Macro = Opts->isMacro();
Features = Opts->getFeatures();
}
unsigned getATRegIndex() const { return ATReg; }
bool setATRegIndex(unsigned Reg) {
if (Reg > 31)
return false;
ATReg = Reg;
return true;
}
bool isReorder() const { return Reorder; }
void setReorder() { Reorder = true; }
void setNoReorder() { Reorder = false; }
bool isMacro() const { return Macro; }
void setMacro() { Macro = true; }
void setNoMacro() { Macro = false; }
const FeatureBitset &getFeatures() const { return Features; }
void setFeatures(const FeatureBitset &Features_) { Features = Features_; }
// Set of features that are either architecture features or referenced
// by them (e.g.: FeatureNaN2008 implied by FeatureMips32r6).
// The full table can be found in MipsGenSubtargetInfo.inc (MipsFeatureKV[]).
// The reason we need this mask is explained in the selectArch function.
// FIXME: Ideally we would like TableGen to generate this information.
static const FeatureBitset AllArchRelatedMask;
private:
unsigned ATReg = 1;
bool Reorder = true;
bool Macro = true;
FeatureBitset Features;
};
} // end anonymous namespace
const FeatureBitset MipsAssemblerOptions::AllArchRelatedMask = {
Mips::FeatureMips1, Mips::FeatureMips2, Mips::FeatureMips3,
Mips::FeatureMips3_32, Mips::FeatureMips3_32r2, Mips::FeatureMips4,
Mips::FeatureMips4_32, Mips::FeatureMips4_32r2, Mips::FeatureMips5,
Mips::FeatureMips5_32r2, Mips::FeatureMips32, Mips::FeatureMips32r2,
Mips::FeatureMips32r3, Mips::FeatureMips32r5, Mips::FeatureMips32r6,
Mips::FeatureMips64, Mips::FeatureMips64r2, Mips::FeatureMips64r3,
Mips::FeatureMips64r5, Mips::FeatureMips64r6, Mips::FeatureCnMips,
Mips::FeatureFP64Bit, Mips::FeatureGP64Bit, Mips::FeatureNaN2008
};
namespace {
class MipsAsmParser : public MCTargetAsmParser {
MipsTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<MipsTargetStreamer &>(TS);
}
MipsABIInfo ABI;
SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions;
MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a
// nullptr, which indicates that no function is currently
// selected. This usually happens after an '.end func'
// directive.
bool IsLittleEndian;
bool IsPicEnabled;
bool IsCpRestoreSet;
int CpRestoreOffset;
unsigned GPReg;
unsigned CpSaveLocation;
/// If true, then CpSaveLocation is a register, otherwise it's an offset.
bool CpSaveLocationIsRegister;
// Map of register aliases created via the .set directive.
StringMap<AsmToken> RegisterSets;
// Print a warning along with its fix-it message at the given range.
void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
SMRange Range, bool ShowColors = true);
void ConvertXWPOperands(MCInst &Inst, const OperandVector &Operands);
#define GET_ASSEMBLER_HEADER
#include "MipsGenAsmMatcher.inc"
unsigned
checkEarlyTargetMatchPredicate(MCInst &Inst,
const OperandVector &Operands) override;
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
/// Parse a register as used in CFI directives
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
bool parseParenSuffix(StringRef Name, OperandVector &Operands);
bool parseBracketSuffix(StringRef Name, OperandVector &Operands);
bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
OperandMatchResultTy parseMemOperand(OperandVector &Operands);
OperandMatchResultTy
matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
StringRef Identifier, SMLoc S);
OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
const AsmToken &Token,
SMLoc S);
OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
SMLoc S);
OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
OperandMatchResultTy parseImm(OperandVector &Operands);
OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
OperandMatchResultTy parseInvNum(OperandVector &Operands);
OperandMatchResultTy parseRegisterList(OperandVector &Operands);
bool searchSymbolAlias(OperandVector &Operands);
bool parseOperand(OperandVector &, StringRef Mnemonic);
enum MacroExpanderResultTy {
MER_NotAMacro,
MER_Success,
MER_Fail,
};
// Expands assembly pseudo instructions.
MacroExpanderResultTy tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg,
bool Is32BitImm, bool IsAddress, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg,
unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
bool emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, MCSymbol *Sym);
bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
bool expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, bool Is64FPU,
SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
const MCOperand &Offset, bool Is32BitAddress,
SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
void expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, bool IsLoad);
bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, const bool IsMips64,
const bool Signed);
bool expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandRotation(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, bool IsLoad);
bool expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
bool reportParseError(Twine ErrorMsg);
bool reportParseError(SMLoc Loc, Twine ErrorMsg);
bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
bool isEvaluated(const MCExpr *Expr);
bool parseSetMips0Directive();
bool parseSetArchDirective();
bool parseSetFeature(uint64_t Feature);
bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup.
bool parseDirectiveCpLoad(SMLoc Loc);
bool parseDirectiveCpLocal(SMLoc Loc);
bool parseDirectiveCpRestore(SMLoc Loc);
bool parseDirectiveCPSetup();
bool parseDirectiveCPReturn();
bool parseDirectiveNaN();
bool parseDirectiveSet();
bool parseDirectiveOption();
bool parseInsnDirective();
bool parseRSectionDirective(StringRef Section);
bool parseSSectionDirective(StringRef Section, unsigned Type);
bool parseSetAtDirective();
bool parseSetNoAtDirective();
bool parseSetMacroDirective();
bool parseSetNoMacroDirective();
bool parseSetMsaDirective();
bool parseSetNoMsaDirective();
bool parseSetNoDspDirective();
bool parseSetReorderDirective();
bool parseSetNoReorderDirective();
bool parseSetMips16Directive();
bool parseSetNoMips16Directive();
bool parseSetFpDirective();
bool parseSetOddSPRegDirective();
bool parseSetNoOddSPRegDirective();
bool parseSetPopDirective();
bool parseSetPushDirective();
bool parseSetSoftFloatDirective();
bool parseSetHardFloatDirective();
bool parseSetMtDirective();
bool parseSetNoMtDirective();
bool parseSetNoCRCDirective();
bool parseSetNoVirtDirective();
bool parseSetNoGINVDirective();
bool parseSetAssignment();
bool parseDirectiveGpWord();
bool parseDirectiveGpDWord();
bool parseDirectiveDtpRelWord();
bool parseDirectiveDtpRelDWord();
bool parseDirectiveTpRelWord();
bool parseDirectiveTpRelDWord();
bool parseDirectiveModule();
bool parseDirectiveModuleFP();
bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
StringRef Directive);
bool parseInternalDirectiveReallowModule();
bool eatComma(StringRef ErrorStr);
int matchCPURegisterName(StringRef Symbol);
int matchHWRegsRegisterName(StringRef Symbol);
int matchFPURegisterName(StringRef Name);
int matchFCCRegisterName(StringRef Name);
int matchACRegisterName(StringRef Name);
int matchMSA128RegisterName(StringRef Name);
int matchMSA128CtrlRegisterName(StringRef Name);
unsigned getReg(int RC, int RegNo);
/// Returns the internal register number for the current AT. Also checks if
/// the current AT is unavailable (set to $0) and gives an error if it is.
/// This should be used in pseudo-instruction expansions which need AT.
unsigned getATReg(SMLoc Loc);
bool canUseATReg();
bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
// Helper function that checks if the value of a vector index is within the
// boundaries of accepted values for each RegisterKind
// Example: INSERT.B $w0[n], $1 => 16 > n >= 0
bool validateMSAIndex(int Val, int RegKind);
// Selects a new architecture by updating the FeatureBits with the necessary
// info including implied dependencies.
// Internally, it clears all the feature bits related to *any* architecture
// and selects the new one using the ToggleFeature functionality of the
// MCSubtargetInfo object that handles implied dependencies. The reason we
// clear all the arch related bits manually is because ToggleFeature only
// clears the features that imply the feature being cleared and not the
// features implied by the feature being cleared. This is easier to see
// with an example:
// --------------------------------------------------
// | Feature | Implies |
// | -------------------------------------------------|
// | FeatureMips1 | None |
// | FeatureMips2 | FeatureMips1 |
// | FeatureMips3 | FeatureMips2 | FeatureMipsGP64 |
// | FeatureMips4 | FeatureMips3 |
// | ... | |
// --------------------------------------------------
//
// Setting Mips3 is equivalent to setting (FeatureMips3 | FeatureMips2 |
// FeatureMipsGP64 | FeatureMips1).
// Clearing Mips3 is equivalent to clearing (FeatureMips3 | FeatureMips4).
void selectArch(StringRef ArchFeature) {
MCSubtargetInfo &STI = copySTI();
FeatureBitset FeatureBits = STI.getFeatureBits();
FeatureBits &= ~MipsAssemblerOptions::AllArchRelatedMask;
STI.setFeatureBits(FeatureBits);
setAvailableFeatures(
ComputeAvailableFeatures(STI.ToggleFeature(ArchFeature)));
AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
}
void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
if (!(getSTI().getFeatureBits()[Feature])) {
MCSubtargetInfo &STI = copySTI();
setAvailableFeatures(
ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
}
}
void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
if (getSTI().getFeatureBits()[Feature]) {
MCSubtargetInfo &STI = copySTI();
setAvailableFeatures(
ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
}
}
void setModuleFeatureBits(uint64_t Feature, StringRef FeatureString) {
setFeatureBits(Feature, FeatureString);
AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits());
}
void clearModuleFeatureBits(uint64_t Feature, StringRef FeatureString) {
clearFeatureBits(Feature, FeatureString);
AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits());
}
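// Sketch of how the helpers above are typically driven (the concrete calls
// live in the ".set"/".module" directive parsers further down): a directive
// such as ".set msa" ends up in setFeatureBits(Mips::FeatureMSA, "msa"),
// ".set nomsa" in clearFeatureBits(Mips::FeatureMSA, "msa"), and the
// ".module" variants additionally record the change in
// AssemblerOptions.front(), the option set treated as the module-level
// baseline.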
public:
enum MipsMatchResultTy {
Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY,
Match_RequiresDifferentOperands,
Match_RequiresNoZeroRegister,
Match_RequiresSameSrcAndDst,
Match_NoFCCRegisterForCurrentISA,
Match_NonZeroOperandForSync,
Match_NonZeroOperandForMTCX,
Match_RequiresPosSizeRange0_32,
Match_RequiresPosSizeRange33_64,
Match_RequiresPosSizeUImm6,
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "MipsGenAsmMatcher.inc"
#undef GET_OPERAND_DIAGNOSTIC_TYPES
};
MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, sti, MII),
ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()),
sti.getCPU(), Options)) {
MCAsmParserExtension::Initialize(parser);
parser.addAliasForDirective(".asciiz", ".asciz");
parser.addAliasForDirective(".hword", ".2byte");
parser.addAliasForDirective(".word", ".4byte");
parser.addAliasForDirective(".dword", ".8byte");
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
// Remember the initial assembler options. The user can not modify these.
AssemblerOptions.push_back(
llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
// Create an assembler options environment for the user to modify.
AssemblerOptions.push_back(
llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
getTargetStreamer().updateABIInfo(*this);
if (!isABI_O32() && !useOddSPReg() != 0)
report_fatal_error("-mno-odd-spreg requires the O32 ABI");
CurrentFn = nullptr;
IsPicEnabled = getContext().getObjectFileInfo()->isPositionIndependent();
IsCpRestoreSet = false;
CpRestoreOffset = -1;
GPReg = ABI.GetGlobalPtr();
const Triple &TheTriple = sti.getTargetTriple();
IsLittleEndian = TheTriple.isLittleEndian();
if (getSTI().getCPU() == "mips64r6" && inMicroMipsMode())
report_fatal_error("microMIPS64R6 is not supported", false);
if (!isABI_O32() && inMicroMipsMode())
report_fatal_error("microMIPS64 is not supported", false);
}
/// True if all of $fcc0 - $fcc7 exist for the current ISA.
bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); }
bool isGP64bit() const {
return getSTI().getFeatureBits()[Mips::FeatureGP64Bit];
}
bool isFP64bit() const {
return getSTI().getFeatureBits()[Mips::FeatureFP64Bit];
}
const MipsABIInfo &getABI() const { return ABI; }
bool isABI_N32() const { return ABI.IsN32(); }
bool isABI_N64() const { return ABI.IsN64(); }
bool isABI_O32() const { return ABI.IsO32(); }
bool isABI_FPXX() const {
return getSTI().getFeatureBits()[Mips::FeatureFPXX];
}
bool useOddSPReg() const {
return !(getSTI().getFeatureBits()[Mips::FeatureNoOddSPReg]);
}
bool inMicroMipsMode() const {
return getSTI().getFeatureBits()[Mips::FeatureMicroMips];
}
bool hasMips1() const {
return getSTI().getFeatureBits()[Mips::FeatureMips1];
}
bool hasMips2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips2];
}
bool hasMips3() const {
return getSTI().getFeatureBits()[Mips::FeatureMips3];
}
bool hasMips4() const {
return getSTI().getFeatureBits()[Mips::FeatureMips4];
}
bool hasMips5() const {
return getSTI().getFeatureBits()[Mips::FeatureMips5];
}
bool hasMips32() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32];
}
bool hasMips64() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64];
}
bool hasMips32r2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32r2];
}
bool hasMips64r2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64r2];
}
bool hasMips32r3() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]);
}
bool hasMips64r3() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]);
}
bool hasMips32r5() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]);
}
bool hasMips64r5() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]);
}
bool hasMips32r6() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32r6];
}
bool hasMips64r6() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64r6];
}
bool hasDSP() const {
return getSTI().getFeatureBits()[Mips::FeatureDSP];
}
bool hasDSPR2() const {
return getSTI().getFeatureBits()[Mips::FeatureDSPR2];
}
bool hasDSPR3() const {
return getSTI().getFeatureBits()[Mips::FeatureDSPR3];
}
bool hasMSA() const {
return getSTI().getFeatureBits()[Mips::FeatureMSA];
}
bool hasCnMips() const {
return (getSTI().getFeatureBits()[Mips::FeatureCnMips]);
}
bool inPicMode() {
return IsPicEnabled;
}
bool inMips16Mode() const {
return getSTI().getFeatureBits()[Mips::FeatureMips16];
}
bool useTraps() const {
return getSTI().getFeatureBits()[Mips::FeatureUseTCCInDIV];
}
bool useSoftFloat() const {
return getSTI().getFeatureBits()[Mips::FeatureSoftFloat];
}
bool hasMT() const {
return getSTI().getFeatureBits()[Mips::FeatureMT];
}
bool hasCRC() const {
return getSTI().getFeatureBits()[Mips::FeatureCRC];
}
bool hasVirt() const {
return getSTI().getFeatureBits()[Mips::FeatureVirt];
}
bool hasGINV() const {
return getSTI().getFeatureBits()[Mips::FeatureGINV];
}
/// Warn if RegIndex is the same as the current AT.
void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc);
void warnIfNoMacro(SMLoc Loc);
bool isLittle() const { return IsLittleEndian; }
const MCExpr *createTargetUnaryExpr(const MCExpr *E,
AsmToken::TokenKind OperatorToken,
MCContext &Ctx) override {
switch(OperatorToken) {
default:
llvm_unreachable("Unknown token");
return nullptr;
case AsmToken::PercentCall16:
return MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, E, Ctx);
case AsmToken::PercentCall_Hi:
return MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16, E, Ctx);
case AsmToken::PercentCall_Lo:
return MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16, E, Ctx);
case AsmToken::PercentDtprel_Hi:
return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL_HI, E, Ctx);
case AsmToken::PercentDtprel_Lo:
return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL_LO, E, Ctx);
case AsmToken::PercentGot:
return MipsMCExpr::create(MipsMCExpr::MEK_GOT, E, Ctx);
case AsmToken::PercentGot_Disp:
return MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, E, Ctx);
case AsmToken::PercentGot_Hi:
return MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, E, Ctx);
case AsmToken::PercentGot_Lo:
return MipsMCExpr::create(MipsMCExpr::MEK_GOT_LO16, E, Ctx);
case AsmToken::PercentGot_Ofst:
return MipsMCExpr::create(MipsMCExpr::MEK_GOT_OFST, E, Ctx);
case AsmToken::PercentGot_Page:
return MipsMCExpr::create(MipsMCExpr::MEK_GOT_PAGE, E, Ctx);
case AsmToken::PercentGottprel:
return MipsMCExpr::create(MipsMCExpr::MEK_GOTTPREL, E, Ctx);
case AsmToken::PercentGp_Rel:
return MipsMCExpr::create(MipsMCExpr::MEK_GPREL, E, Ctx);
case AsmToken::PercentHi:
return MipsMCExpr::create(MipsMCExpr::MEK_HI, E, Ctx);
case AsmToken::PercentHigher:
return MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, E, Ctx);
case AsmToken::PercentHighest:
return MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, E, Ctx);
case AsmToken::PercentLo:
return MipsMCExpr::create(MipsMCExpr::MEK_LO, E, Ctx);
case AsmToken::PercentNeg:
return MipsMCExpr::create(MipsMCExpr::MEK_NEG, E, Ctx);
case AsmToken::PercentPcrel_Hi:
return MipsMCExpr::create(MipsMCExpr::MEK_PCREL_HI16, E, Ctx);
case AsmToken::PercentPcrel_Lo:
return MipsMCExpr::create(MipsMCExpr::MEK_PCREL_LO16, E, Ctx);
case AsmToken::PercentTlsgd:
return MipsMCExpr::create(MipsMCExpr::MEK_TLSGD, E, Ctx);
case AsmToken::PercentTlsldm:
return MipsMCExpr::create(MipsMCExpr::MEK_TLSLDM, E, Ctx);
case AsmToken::PercentTprel_Hi:
return MipsMCExpr::create(MipsMCExpr::MEK_TPREL_HI, E, Ctx);
case AsmToken::PercentTprel_Lo:
return MipsMCExpr::create(MipsMCExpr::MEK_TPREL_LO, E, Ctx);
}
}
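// Example: when an operand such as "%hi(sym)" is parsed, the expression parser
// hands the PercentHi token to this hook, which wraps the symbol expression as
// MipsMCExpr::create(MipsMCExpr::MEK_HI, E, Ctx); the remaining %-operators
// map analogously through the switch above.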
};
/// MipsOperand - Instances of this class represent a parsed Mips machine
/// instruction.
class MipsOperand : public MCParsedAsmOperand {
public:
/// Broad categories of register classes
/// The exact class is finalized by the render method.
enum RegKind {
RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64bit())
RegKind_FGR = 2, /// FGR32, FGR64, AFGR64 (depending on context and
/// isFP64bit())
RegKind_FCC = 4, /// FCC
RegKind_MSA128 = 8, /// MSA128[BHWD] (makes no difference which)
RegKind_MSACtrl = 16, /// MSA control registers
RegKind_COP2 = 32, /// COP2
RegKind_ACC = 64, /// HI32DSP, LO32DSP, and ACC64DSP (depending on
/// context).
RegKind_CCR = 128, /// CCR
RegKind_HWRegs = 256, /// HWRegs
RegKind_COP3 = 512, /// COP3
RegKind_COP0 = 1024, /// COP0
/// Potentially any (e.g. $1)
RegKind_Numeric = RegKind_GPR | RegKind_FGR | RegKind_FCC | RegKind_MSA128 |
RegKind_MSACtrl | RegKind_COP2 | RegKind_ACC |
RegKind_CCR | RegKind_HWRegs | RegKind_COP3 | RegKind_COP0
};
private:
enum KindTy {
k_Immediate, /// An immediate (possibly involving symbol references)
k_Memory, /// Base + Offset Memory Address
k_RegisterIndex, /// A register index in one or more RegKind.
k_Token, /// A simple token
k_RegList, /// A physical register list
} Kind;
public:
MipsOperand(KindTy K, MipsAsmParser &Parser)
: MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
~MipsOperand() override {
switch (Kind) {
case k_Memory:
delete Mem.Base;
break;
case k_RegList:
delete RegList.List;
break;
case k_Immediate:
case k_RegisterIndex:
case k_Token:
break;
}
}
private:
/// For diagnostics, and checking the assembler temporary
MipsAsmParser &AsmParser;
struct Token {
const char *Data;
unsigned Length;
};
struct RegIdxOp {
unsigned Index; /// Index into the register class
RegKind Kind; /// Bitfield of the kinds it could possibly be
struct Token Tok; /// The input token this operand originated from.
const MCRegisterInfo *RegInfo;
};
struct ImmOp {
const MCExpr *Val;
};
struct MemOp {
MipsOperand *Base;
const MCExpr *Off;
};
struct RegListOp {
SmallVector<unsigned, 10> *List;
};
union {
struct Token Tok;
struct RegIdxOp RegIdx;
struct ImmOp Imm;
struct MemOp Mem;
struct RegListOp RegList;
};
SMLoc StartLoc, EndLoc;
/// Internal constructor for register kinds
static std::unique_ptr<MipsOperand> CreateReg(unsigned Index, StringRef Str,
RegKind RegKind,
const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E,
MipsAsmParser &Parser) {
auto Op = llvm::make_unique<MipsOperand>(k_RegisterIndex, Parser);
Op->RegIdx.Index = Index;
Op->RegIdx.RegInfo = RegInfo;
Op->RegIdx.Kind = RegKind;
Op->RegIdx.Tok.Data = Str.data();
Op->RegIdx.Tok.Length = Str.size();
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
public:
/// Coerce the register to GPR32 and return the real register for the current
/// target.
unsigned getGPR32Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
AsmParser.warnIfRegIndexIsAT(RegIdx.Index, StartLoc);
unsigned ClassID = Mips::GPR32RegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to GPR32 and return the real register for the current
/// target.
unsigned getGPRMM16Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
unsigned ClassID = Mips::GPR32RegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to GPR64 and return the real register for the current
/// target.
unsigned getGPR64Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
unsigned ClassID = Mips::GPR64RegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
private:
/// Coerce the register to AFGR64 and return the real register for the current
/// target.
unsigned getAFGR64Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
if (RegIdx.Index % 2 != 0)
AsmParser.Warning(StartLoc, "Float register should be even.");
return RegIdx.RegInfo->getRegClass(Mips::AFGR64RegClassID)
.getRegister(RegIdx.Index / 2);
}
/// Coerce the register to FGR64 and return the real register for the current
/// target.
unsigned getFGR64Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
return RegIdx.RegInfo->getRegClass(Mips::FGR64RegClassID)
.getRegister(RegIdx.Index);
}
/// Coerce the register to FGR32 and return the real register for the current
/// target.
unsigned getFGR32Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
return RegIdx.RegInfo->getRegClass(Mips::FGR32RegClassID)
.getRegister(RegIdx.Index);
}
/// Coerce the register to FCC and return the real register for the current
/// target.
unsigned getFCCReg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_FCC) && "Invalid access!");
return RegIdx.RegInfo->getRegClass(Mips::FCCRegClassID)
.getRegister(RegIdx.Index);
}
/// Coerce the register to MSA128 and return the real register for the current
/// target.
unsigned getMSA128Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_MSA128) && "Invalid access!");
// It doesn't matter which of the MSA128[BHWD] classes we use. They are all
// identical
unsigned ClassID = Mips::MSA128BRegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to MSACtrl and return the real register for the
/// current target.
unsigned getMSACtrlReg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_MSACtrl) && "Invalid access!");
unsigned ClassID = Mips::MSACtrlRegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to COP0 and return the real register for the
/// current target.
unsigned getCOP0Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_COP0) && "Invalid access!");
unsigned ClassID = Mips::COP0RegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to COP2 and return the real register for the
/// current target.
unsigned getCOP2Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_COP2) && "Invalid access!");
unsigned ClassID = Mips::COP2RegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to COP3 and return the real register for the
/// current target.
unsigned getCOP3Reg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_COP3) && "Invalid access!");
unsigned ClassID = Mips::COP3RegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to ACC64DSP and return the real register for the
/// current target.
unsigned getACC64DSPReg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
unsigned ClassID = Mips::ACC64DSPRegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to HI32DSP and return the real register for the
/// current target.
unsigned getHI32DSPReg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
unsigned ClassID = Mips::HI32DSPRegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to LO32DSP and return the real register for the
/// current target.
unsigned getLO32DSPReg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
unsigned ClassID = Mips::LO32DSPRegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to CCR and return the real register for the
/// current target.
unsigned getCCRReg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_CCR) && "Invalid access!");
unsigned ClassID = Mips::CCRRegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
/// Coerce the register to HWRegs and return the real register for the
/// current target.
unsigned getHWRegsReg() const {
assert(isRegIdx() && (RegIdx.Kind & RegKind_HWRegs) && "Invalid access!");
unsigned ClassID = Mips::HWRegsRegClassID;
return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
public:
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediate when possible. Null MCExpr = 0.
if (!Expr)
Inst.addOperand(MCOperand::createImm(0));
else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
Inst.addOperand(MCOperand::createImm(CE->getValue()));
else
Inst.addOperand(MCOperand::createExpr(Expr));
}
void addRegOperands(MCInst &Inst, unsigned N) const {
llvm_unreachable("Use a custom parser instead");
}
/// Render the operand to an MCInst as a GPR32
/// Asserts if the wrong number of operands are requested, or the operand
/// is not a k_RegisterIndex compatible with RegKind_GPR
void addGPR32ZeroAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
}
void addGPR32NonZeroAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
}
void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
}
void addGPRMM16AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPRMM16Reg()));
}
void addGPRMM16AsmRegZeroOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPRMM16Reg()));
}
void addGPRMM16AsmRegMovePOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPRMM16Reg()));
}
void addGPRMM16AsmRegMovePPairFirstOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPRMM16Reg()));
}
void addGPRMM16AsmRegMovePPairSecondOperands(MCInst &Inst,
unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPRMM16Reg()));
}
/// Render the operand to an MCInst as a GPR64
/// Asserts if the wrong number of operands are requested, or the operand
/// is not a k_RegisterIndex compatible with RegKind_GPR
void addGPR64AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPR64Reg()));
}
void addAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getAFGR64Reg()));
}
void addStrictlyAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getAFGR64Reg()));
}
void addStrictlyFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getFGR64Reg()));
}
void addFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getFGR64Reg()));
}
void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getFGR32Reg()));
// FIXME: We ought to do this for -integrated-as without -via-file-asm too.
// FIXME: This should propagate failure up to parseStatement.
if (!AsmParser.useOddSPReg() && RegIdx.Index & 1)
AsmParser.getParser().printError(
StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
"registers");
}
void addStrictlyFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getFGR32Reg()));
// FIXME: We ought to do this for -integrated-as without -via-file-asm too.
if (!AsmParser.useOddSPReg() && RegIdx.Index & 1)
AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
"registers");
}
void addFCCAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getFCCReg()));
}
void addMSA128AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getMSA128Reg()));
}
void addMSACtrlAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getMSACtrlReg()));
}
void addCOP0AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getCOP0Reg()));
}
void addCOP2AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getCOP2Reg()));
}
void addCOP3AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getCOP3Reg()));
}
void addACC64DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getACC64DSPReg()));
}
void addHI32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getHI32DSPReg()));
}
void addLO32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getLO32DSPReg()));
}
void addCCRAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getCCRReg()));
}
void addHWRegsAsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getHWRegsReg()));
}
template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
void addConstantUImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
uint64_t Imm = getConstantImm() - Offset;
Imm &= (1ULL << Bits) - 1;
Imm += Offset;
Imm += AdjustOffset;
Inst.addOperand(MCOperand::createImm(Imm));
}
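// Worked example for the template above (parameters chosen for illustration):
// with Bits = 5 and Offset = 32, a parsed constant of 33 renders as
// ((33 - 32) & 0x1f) + 32 = 33, while 65 wraps to ((65 - 32) & 0x1f) + 32 = 33
// as well; a non-zero AdjustOffset is simply added on top of the result.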
template <unsigned Bits>
void addSImmOperands(MCInst &Inst, unsigned N) const {
if (isImm() && !isConstantImm()) {
addExpr(Inst, getImm());
return;
}
addConstantSImmOperands<Bits, 0, 0>(Inst, N);
}
template <unsigned Bits>
void addUImmOperands(MCInst &Inst, unsigned N) const {
if (isImm() && !isConstantImm()) {
addExpr(Inst, getImm());
return;
}
addConstantUImmOperands<Bits, 0, 0>(Inst, N);
}
template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
void addConstantSImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
int64_t Imm = getConstantImm() - Offset;
Imm = SignExtend64<Bits>(Imm);
Imm += Offset;
Imm += AdjustOffset;
Inst.addOperand(MCOperand::createImm(Imm));
}
void addImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCExpr *Expr = getImm();
addExpr(Inst, Expr);
}
void addMemOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(AsmParser.getABI().ArePtrs64bit()
? getMemBase()->getGPR64Reg()
: getMemBase()->getGPR32Reg()));
const MCExpr *Expr = getMemOff();
addExpr(Inst, Expr);
}
void addMicroMipsMemOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getMemBase()->getGPRMM16Reg()));
const MCExpr *Expr = getMemOff();
addExpr(Inst, Expr);
}
void addRegListOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
for (auto RegNo : getRegList())
Inst.addOperand(MCOperand::createReg(RegNo));
}
bool isReg() const override {
// As a special case until we sort out the definition of div/divu, accept
// $0/$zero here so that MCK_ZERO works correctly.
return isGPRAsmReg() && RegIdx.Index == 0;
}
bool isRegIdx() const { return Kind == k_RegisterIndex; }
bool isImm() const override { return Kind == k_Immediate; }
bool isConstantImm() const {
int64_t Res;
return isImm() && getImm()->evaluateAsAbsolute(Res);
}
bool isConstantImmz() const {
return isConstantImm() && getConstantImm() == 0;
}
template <unsigned Bits, int Offset = 0> bool isConstantUImm() const {
return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset);
}
template <unsigned Bits> bool isSImm() const {
return isConstantImm() ? isInt<Bits>(getConstantImm()) : isImm();
}
template <unsigned Bits> bool isUImm() const {
return isConstantImm() ? isUInt<Bits>(getConstantImm()) : isImm();
}
template <unsigned Bits> bool isAnyImm() const {
return isConstantImm() ? (isInt<Bits>(getConstantImm()) ||
isUInt<Bits>(getConstantImm()))
: isImm();
}
template <unsigned Bits, int Offset = 0> bool isConstantSImm() const {
return isConstantImm() && isInt<Bits>(getConstantImm() - Offset);
}
template <unsigned Bottom, unsigned Top> bool isConstantUImmRange() const {
return isConstantImm() && getConstantImm() >= Bottom &&
getConstantImm() <= Top;
}
bool isToken() const override {
// Note: It's not possible to pretend that other operand kinds are tokens.
// The matcher emitter checks tokens first.
return Kind == k_Token;
}
bool isMem() const override { return Kind == k_Memory; }
bool isConstantMemOff() const {
return isMem() && isa<MCConstantExpr>(getMemOff());
}
// Allow relocation operators.
// FIXME: This predicate and others need to look through binary expressions
// and determine whether a Value is a constant or not.
template <unsigned Bits, unsigned ShiftAmount = 0>
bool isMemWithSimmOffset() const {
if (!isMem())
return false;
if (!getMemBase()->isGPRAsmReg())
return false;
if (isa<MCTargetExpr>(getMemOff()) ||
(isConstantMemOff() &&
isShiftedInt<Bits, ShiftAmount>(getConstantMemOff())))
return true;
MCValue Res;
bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr);
return IsReloc && isShiftedInt<Bits, ShiftAmount>(Res.getConstant());
}
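// Example for the predicate above with Bits = 16 and ShiftAmount = 0 (the
// usual lw/sw offset form): "8($4)" passes via the constant path,
// "%lo(sym)($4)" passes because the offset is an MCTargetExpr, while an
// out-of-range constant such as "0x12345($4)" fails isShiftedInt<16, 0> and is
// left to the macro expansion path instead.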
bool isMemWithPtrSizeOffset() const {
if (!isMem())
return false;
if (!getMemBase()->isGPRAsmReg())
return false;
const unsigned PtrBits = AsmParser.getABI().ArePtrs64bit() ? 64 : 32;
if (isa<MCTargetExpr>(getMemOff()) ||
(isConstantMemOff() && isIntN(PtrBits, getConstantMemOff())))
return true;
MCValue Res;
bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr);
return IsReloc && isIntN(PtrBits, Res.getConstant());
}
bool isMemWithGRPMM16Base() const {
return isMem() && getMemBase()->isMM16AsmReg();
}
template <unsigned Bits> bool isMemWithUimmOffsetSP() const {
return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
&& getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP);
}
template <unsigned Bits> bool isMemWithUimmWordAlignedOffsetSP() const {
return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
&& (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
&& (getMemBase()->getGPR32Reg() == Mips::SP);
}
template <unsigned Bits> bool isMemWithSimmWordAlignedOffsetGP() const {
return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff())
&& (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
&& (getMemBase()->getGPR32Reg() == Mips::GP);
}
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledUImm() const {
return isConstantImm() &&
isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm());
}
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledSImm() const {
if (isConstantImm() &&
isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm()))
return true;
// Operand can also be a symbol or symbol plus
// offset in case of relocations.
if (Kind != k_Immediate)
return false;
MCValue Res;
bool Success = getImm()->evaluateAsRelocatable(Res, nullptr, nullptr);
return Success && isShiftedInt<Bits, ShiftLeftAmount>(Res.getConstant());
}
bool isRegList16() const {
if (!isRegList())
return false;
int Size = RegList.List->size();
if (Size < 2 || Size > 5)
return false;
unsigned R0 = RegList.List->front();
unsigned R1 = RegList.List->back();
if (!((R0 == Mips::S0 && R1 == Mips::RA) ||
(R0 == Mips::S0_64 && R1 == Mips::RA_64)))
return false;
int PrevReg = *RegList.List->begin();
for (int i = 1; i < Size - 1; i++) {
int Reg = (*(RegList.List))[i];
if ( Reg != PrevReg + 1)
return false;
PrevReg = Reg;
}
return true;
}
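// Example of a list accepted by isRegList16(): "$s0, $s1, $s2, $ra" -- it
// starts at $s0, ends at $ra, has between two and five entries, and the
// registers between the endpoints are consecutive. A list with a gap, such as
// "$s0, $s2, $ra", or one not bracketed by $s0 and $ra, is rejected.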
bool isInvNum() const { return Kind == k_Immediate; }
bool isLSAImm() const {
if (!isConstantImm())
return false;
int64_t Val = getConstantImm();
return 1 <= Val && Val <= 4;
}
bool isRegList() const { return Kind == k_RegList; }
StringRef getToken() const {
assert(Kind == k_Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
}
unsigned getReg() const override {
// As a special case until we sort out the definition of div/divu, accept
// $0/$zero here so that MCK_ZERO works correctly.
if (Kind == k_RegisterIndex && RegIdx.Index == 0 &&
RegIdx.Kind & RegKind_GPR)
return getGPR32Reg(); // FIXME: GPR64 too
llvm_unreachable("Invalid access!");
return 0;
}
const MCExpr *getImm() const {
assert((Kind == k_Immediate) && "Invalid access!");
return Imm.Val;
}
int64_t getConstantImm() const {
const MCExpr *Val = getImm();
int64_t Value = 0;
(void)Val->evaluateAsAbsolute(Value);
return Value;
}
MipsOperand *getMemBase() const {
assert((Kind == k_Memory) && "Invalid access!");
return Mem.Base;
}
const MCExpr *getMemOff() const {
assert((Kind == k_Memory) && "Invalid access!");
return Mem.Off;
}
int64_t getConstantMemOff() const {
return static_cast<const MCConstantExpr *>(getMemOff())->getValue();
}
const SmallVectorImpl<unsigned> &getRegList() const {
assert((Kind == k_RegList) && "Invalid access!");
return *(RegList.List);
}
static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
MipsAsmParser &Parser) {
auto Op = llvm::make_unique<MipsOperand>(k_Token, Parser);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
/// Create a numeric register (e.g. $1). The exact register remains
/// unresolved until an instruction successfully matches
static std::unique_ptr<MipsOperand>
createNumericReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
LLVM_DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n");
return CreateReg(Index, Str, RegKind_Numeric, RegInfo, S, E, Parser);
}
/// Create a register that is definitely a GPR.
/// This is typically only used for named registers such as $gp.
static std::unique_ptr<MipsOperand>
createGPRReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, Str, RegKind_GPR, RegInfo, S, E, Parser);
}
/// Create a register that is definitely a FGR.
/// This is typically only used for named registers such as $f0.
static std::unique_ptr<MipsOperand>
createFGRReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, Str, RegKind_FGR, RegInfo, S, E, Parser);
}
/// Create a register that is definitely a HWReg.
/// This is typically only used for named registers such as $hwr_cpunum.
static std::unique_ptr<MipsOperand>
createHWRegsReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, Str, RegKind_HWRegs, RegInfo, S, E, Parser);
}
/// Create a register that is definitely an FCC.
/// This is typically only used for named registers such as $fcc0.
static std::unique_ptr<MipsOperand>
createFCCReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, Str, RegKind_FCC, RegInfo, S, E, Parser);
}
/// Create a register that is definitely an ACC.
/// This is typically only used for named registers such as $ac0.
static std::unique_ptr<MipsOperand>
createACCReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, Str, RegKind_ACC, RegInfo, S, E, Parser);
}
/// Create a register that is definitely an MSA128.
/// This is typically only used for named registers such as $w0.
static std::unique_ptr<MipsOperand>
createMSA128Reg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, Str, RegKind_MSA128, RegInfo, S, E, Parser);
}
/// Create a register that is definitely an MSACtrl.
/// This is typically only used for named registers such as $msaaccess.
static std::unique_ptr<MipsOperand>
createMSACtrlReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, Str, RegKind_MSACtrl, RegInfo, S, E, Parser);
}
static std::unique_ptr<MipsOperand>
CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
auto Op = llvm::make_unique<MipsOperand>(k_Immediate, Parser);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
static std::unique_ptr<MipsOperand>
CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S,
SMLoc E, MipsAsmParser &Parser) {
auto Op = llvm::make_unique<MipsOperand>(k_Memory, Parser);
Op->Mem.Base = Base.release();
Op->Mem.Off = Off;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
static std::unique_ptr<MipsOperand>
CreateRegList(SmallVectorImpl<unsigned> &Regs, SMLoc StartLoc, SMLoc EndLoc,
MipsAsmParser &Parser) {
assert(Regs.size() > 0 && "Empty list not allowed");
auto Op = llvm::make_unique<MipsOperand>(k_RegList, Parser);
Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end());
Op->StartLoc = StartLoc;
Op->EndLoc = EndLoc;
return Op;
}
bool isGPRZeroAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index == 0;
}
bool isGPRNonZeroAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index > 0 &&
RegIdx.Index <= 31;
}
bool isGPRAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
}
bool isMM16AsmReg() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return ((RegIdx.Index >= 2 && RegIdx.Index <= 7)
|| RegIdx.Index == 16 || RegIdx.Index == 17);
}
bool isMM16AsmRegZero() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return (RegIdx.Index == 0 ||
(RegIdx.Index >= 2 && RegIdx.Index <= 7) ||
RegIdx.Index == 17);
}
bool isMM16AsmRegMoveP() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return (RegIdx.Index == 0 || (RegIdx.Index >= 2 && RegIdx.Index <= 3) ||
(RegIdx.Index >= 16 && RegIdx.Index <= 20));
}
bool isMM16AsmRegMovePPairFirst() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return RegIdx.Index >= 4 && RegIdx.Index <= 6;
}
bool isMM16AsmRegMovePPairSecond() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return (RegIdx.Index == 21 || RegIdx.Index == 22 ||
(RegIdx.Index >= 5 && RegIdx.Index <= 7));
}
bool isFGRAsmReg() const {
// AFGR64 is $0-$15 but we handle this in getAFGR64()
return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
}
bool isStrictlyFGRAsmReg() const {
// AFGR64 is $0-$15 but we handle this in getAFGR64()
return isRegIdx() && RegIdx.Kind == RegKind_FGR && RegIdx.Index <= 31;
}
bool isHWRegsAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_HWRegs && RegIdx.Index <= 31;
}
bool isCCRAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31;
}
bool isFCCAsmReg() const {
if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC))
return false;
return RegIdx.Index <= 7;
}
bool isACCAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3;
}
bool isCOP0AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP0 && RegIdx.Index <= 31;
}
bool isCOP2AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31;
}
bool isCOP3AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP3 && RegIdx.Index <= 31;
}
bool isMSA128AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31;
}
bool isMSACtrlAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_MSACtrl && RegIdx.Index <= 7;
}
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const override { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const override { return EndLoc; }
void print(raw_ostream &OS) const override {
switch (Kind) {
case k_Immediate:
OS << "Imm<";
OS << *Imm.Val;
OS << ">";
break;
case k_Memory:
OS << "Mem<";
Mem.Base->print(OS);
OS << ", ";
OS << *Mem.Off;
OS << ">";
break;
case k_RegisterIndex:
OS << "RegIdx<" << RegIdx.Index << ":" << RegIdx.Kind << ", "
<< StringRef(RegIdx.Tok.Data, RegIdx.Tok.Length) << ">";
break;
case k_Token:
OS << getToken();
break;
case k_RegList:
OS << "RegList< ";
for (auto Reg : (*RegList.List))
OS << Reg << " ";
OS << ">";
break;
}
}
bool isValidForTie(const MipsOperand &Other) const {
if (Kind != Other.Kind)
return false;
switch (Kind) {
default:
llvm_unreachable("Unexpected kind");
return false;
case k_RegisterIndex: {
StringRef Token(RegIdx.Tok.Data, RegIdx.Tok.Length);
StringRef OtherToken(Other.RegIdx.Tok.Data, Other.RegIdx.Tok.Length);
return Token == OtherToken;
}
}
}
}; // class MipsOperand
} // end anonymous namespace
namespace llvm {
extern const MCInstrDesc MipsInsts[];
} // end namespace llvm
static const MCInstrDesc &getInstDesc(unsigned Opcode) {
return MipsInsts[Opcode];
}
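// Return true for the microMIPS branch and jump forms whose delay slot is
// short (filled with a 16-bit rather than a 32-bit NOP).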
static bool hasShortDelaySlot(MCInst &Inst) {
switch (Inst.getOpcode()) {
case Mips::BEQ_MM:
case Mips::BNE_MM:
case Mips::BLTZ_MM:
case Mips::BGEZ_MM:
case Mips::BLEZ_MM:
case Mips::BGTZ_MM:
case Mips::JRC16_MM:
case Mips::JALS_MM:
case Mips::JALRS_MM:
case Mips::JALRS16_MM:
case Mips::BGEZALS_MM:
case Mips::BLTZALS_MM:
return true;
case Mips::J_MM:
return !Inst.getOperand(0).isReg();
default:
return false;
}
}
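// Walk an expression tree and return the first MCSymbol it references, or
// nullptr if it references none.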
static const MCSymbol *getSingleMCSymbol(const MCExpr *Expr) {
if (const MCSymbolRefExpr *SRExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
return &SRExpr->getSymbol();
}
if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) {
const MCSymbol *LHSSym = getSingleMCSymbol(BExpr->getLHS());
const MCSymbol *RHSSym = getSingleMCSymbol(BExpr->getRHS());
if (LHSSym)
return LHSSym;
if (RHSSym)
return RHSSym;
return nullptr;
}
if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr))
return getSingleMCSymbol(UExpr->getSubExpr());
return nullptr;
}
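// Count the MCSymbolRefExprs contained in an expression tree.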
static unsigned countMCSymbolRefExpr(const MCExpr *Expr) {
if (isa<MCSymbolRefExpr>(Expr))
return 1;
if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr))
return countMCSymbolRefExpr(BExpr->getLHS()) +
countMCSymbolRefExpr(BExpr->getRHS());
if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr))
return countMCSymbolRefExpr(UExpr->getSubExpr());
return 0;
}
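// Perform post-parse checks (branch ranges, restricted operands,
// division-by-zero warnings) and expand assembler macros for a single
// instruction before it is emitted. Returns true on error.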
bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
bool ExpandedJalSym = false;
Inst.setLoc(IDLoc);
if (MCID.isBranch() || MCID.isCall()) {
const unsigned Opcode = Inst.getOpcode();
MCOperand Offset;
switch (Opcode) {
default:
break;
case Mips::BBIT0:
case Mips::BBIT032:
case Mips::BBIT1:
case Mips::BBIT132:
assert(hasCnMips() && "instruction only valid for octeon cpus");
LLVM_FALLTHROUGH;
case Mips::BEQ:
case Mips::BNE:
case Mips::BEQ_MM:
case Mips::BNE_MM:
assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
Offset = Inst.getOperand(2);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(),
1LL << (inMicroMipsMode() ? 1 : 2)))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BGEZ:
case Mips::BGTZ:
case Mips::BLEZ:
case Mips::BLTZ:
case Mips::BGEZAL:
case Mips::BLTZAL:
case Mips::BC1F:
case Mips::BC1T:
case Mips::BGEZ_MM:
case Mips::BGTZ_MM:
case Mips::BLEZ_MM:
case Mips::BLTZ_MM:
case Mips::BGEZAL_MM:
case Mips::BLTZAL_MM:
case Mips::BC1F_MM:
case Mips::BC1T_MM:
case Mips::BC1EQZC_MMR6:
case Mips::BC1NEZC_MMR6:
case Mips::BC2EQZC_MMR6:
case Mips::BC2NEZC_MMR6:
assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
Offset = Inst.getOperand(1);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(),
1LL << (inMicroMipsMode() ? 1 : 2)))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BGEC: case Mips::BGEC_MMR6:
case Mips::BLTC: case Mips::BLTC_MMR6:
case Mips::BGEUC: case Mips::BGEUC_MMR6:
case Mips::BLTUC: case Mips::BLTUC_MMR6:
case Mips::BEQC: case Mips::BEQC_MMR6:
case Mips::BNEC: case Mips::BNEC_MMR6:
assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
Offset = Inst.getOperand(2);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BLEZC: case Mips::BLEZC_MMR6:
case Mips::BGEZC: case Mips::BGEZC_MMR6:
case Mips::BGTZC: case Mips::BGTZC_MMR6:
case Mips::BLTZC: case Mips::BLTZC_MMR6:
assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
Offset = Inst.getOperand(1);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BEQZC: case Mips::BEQZC_MMR6:
case Mips::BNEZC: case Mips::BNEZC_MMR6:
assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
Offset = Inst.getOperand(1);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(23, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BEQZ16_MM:
case Mips::BEQZC16_MMR6:
case Mips::BNEZ16_MM:
case Mips::BNEZC16_MMR6:
assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
Offset = Inst.getOperand(1);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
if (!isInt<8>(Offset.getImm()))
return Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(), 2LL))
return Error(IDLoc, "branch to misaligned address");
break;
}
}
// SSNOP is deprecated on MIPS32r6/MIPS64r6.
// We still accept it, but it is treated as a normal nop.
if (hasMips32r6() && Inst.getOpcode() == Mips::SSNOP) {
std::string ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
Warning(IDLoc, "ssnop is deprecated for " + ISA + " and is equivalent to a "
"nop instruction");
}
if (hasCnMips()) {
const unsigned Opcode = Inst.getOpcode();
MCOperand Opnd;
int Imm;
switch (Opcode) {
default:
break;
case Mips::BBIT0:
case Mips::BBIT032:
case Mips::BBIT1:
case Mips::BBIT132:
assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
// The offset is handled above
Opnd = Inst.getOperand(1);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (Imm < 0 || Imm > (Opcode == Mips::BBIT0 ||
Opcode == Mips::BBIT1 ? 63 : 31))
return Error(IDLoc, "immediate operand value out of range");
if (Imm > 31) {
Inst.setOpcode(Opcode == Mips::BBIT0 ? Mips::BBIT032
: Mips::BBIT132);
Inst.getOperand(1).setImm(Imm - 32);
}
break;
case Mips::SEQi:
case Mips::SNEi:
assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (!isInt<10>(Imm))
return Error(IDLoc, "immediate operand value out of range");
break;
}
}
// Warn on division by zero. We're checking here as all instructions get
// processed here, not just the macros that need expansion.
//
// The MIPS backend models most of the division instructions and macros as
// three operand instructions. The pre-R6 divide instructions however have
// two operands and explicitly define HI/LO as part of the instruction,
// not in the operands.
unsigned FirstOp = 1;
unsigned SecondOp = 2;
switch (Inst.getOpcode()) {
default:
break;
case Mips::SDivIMacro:
case Mips::UDivIMacro:
case Mips::DSDivIMacro:
case Mips::DUDivIMacro:
if (Inst.getOperand(2).getImm() == 0) {
if (Inst.getOperand(1).getReg() == Mips::ZERO ||
Inst.getOperand(1).getReg() == Mips::ZERO_64)
Warning(IDLoc, "dividing zero by zero");
else
Warning(IDLoc, "division by zero");
}
break;
case Mips::DSDIV:
case Mips::SDIV:
case Mips::UDIV:
case Mips::DUDIV:
case Mips::UDIV_MM:
case Mips::SDIV_MM:
FirstOp = 0;
SecondOp = 1;
LLVM_FALLTHROUGH;
case Mips::SDivMacro:
case Mips::DSDivMacro:
case Mips::UDivMacro:
case Mips::DUDivMacro:
case Mips::DIV:
case Mips::DIVU:
case Mips::DDIV:
case Mips::DDIVU:
case Mips::DIVU_MMR6:
case Mips::DIV_MMR6:
if (Inst.getOperand(SecondOp).getReg() == Mips::ZERO ||
Inst.getOperand(SecondOp).getReg() == Mips::ZERO_64) {
if (Inst.getOperand(FirstOp).getReg() == Mips::ZERO ||
Inst.getOperand(FirstOp).getReg() == Mips::ZERO_64)
Warning(IDLoc, "dividing zero by zero");
else
Warning(IDLoc, "division by zero");
}
break;
}
// For PIC code convert unconditional jump to unconditional branch.
if ((Inst.getOpcode() == Mips::J || Inst.getOpcode() == Mips::J_MM) &&
inPicMode()) {
MCInst BInst;
BInst.setOpcode(inMicroMipsMode() ? Mips::BEQ_MM : Mips::BEQ);
BInst.addOperand(MCOperand::createReg(Mips::ZERO));
BInst.addOperand(MCOperand::createReg(Mips::ZERO));
BInst.addOperand(Inst.getOperand(0));
Inst = BInst;
}
// This expansion is not in a function called by tryExpandInstruction()
// because the pseudo-instruction doesn't have a distinct opcode.
if ((Inst.getOpcode() == Mips::JAL || Inst.getOpcode() == Mips::JAL_MM) &&
inPicMode()) {
warnIfNoMacro(IDLoc);
const MCExpr *JalExpr = Inst.getOperand(0).getExpr();
// We can do this expansion if there's only 1 symbol in the argument
// expression.
if (countMCSymbolRefExpr(JalExpr) > 1)
return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode");
// FIXME: This is checking that the expression can be handled by the later
// stages of the assembler. We ought to leave it to those later stages.
const MCSymbol *JalSym = getSingleMCSymbol(JalExpr);
// FIXME: Add support for label+offset operands (currently causes an error).
// FIXME: Add support for forward-declared local symbols.
// FIXME: Add expansion for when the LargeGOT option is enabled.
if (JalSym->isInSection() || JalSym->isTemporary() ||
(JalSym->isELF() &&
cast<MCSymbolELF>(JalSym)->getBinding() == ELF::STB_LOCAL)) {
if (isABI_O32()) {
// If it's a local symbol and the O32 ABI is being used, we expand to:
// lw $25, 0($gp)
// R_(MICRO)MIPS_GOT16 label
// addiu $25, $25, 0
// R_(MICRO)MIPS_LO16 label
// jalr $25
const MCExpr *Got16RelocExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT, JalExpr, getContext());
const MCExpr *Lo16RelocExpr =
MipsMCExpr::create(MipsMCExpr::MEK_LO, JalExpr, getContext());
TOut.emitRRX(Mips::LW, Mips::T9, GPReg,
MCOperand::createExpr(Got16RelocExpr), IDLoc, STI);
TOut.emitRRX(Mips::ADDiu, Mips::T9, Mips::T9,
MCOperand::createExpr(Lo16RelocExpr), IDLoc, STI);
} else if (isABI_N32() || isABI_N64()) {
// If it's a local symbol and the N32/N64 ABIs are being used,
// we expand to:
// lw/ld $25, 0($gp)
// R_(MICRO)MIPS_GOT_DISP label
// jalr $25
const MCExpr *GotDispRelocExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, JalExpr, getContext());
TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9,
GPReg, MCOperand::createExpr(GotDispRelocExpr), IDLoc,
STI);
}
} else {
// If it's an external/weak symbol, we expand to:
// lw/ld $25, 0($gp)
// R_(MICRO)MIPS_CALL16 label
// jalr $25
const MCExpr *Call16RelocExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, JalExpr, getContext());
TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, GPReg,
MCOperand::createExpr(Call16RelocExpr), IDLoc, STI);
}
MCInst JalrInst;
if (IsCpRestoreSet && inMicroMipsMode())
JalrInst.setOpcode(Mips::JALRS_MM);
else
JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
JalrInst.addOperand(MCOperand::createReg(Mips::RA));
JalrInst.addOperand(MCOperand::createReg(Mips::T9));
if (EmitJalrReloc) {
// As an optimization hint for the linker, before the JALR we add:
// .reloc tmplabel, R_{MICRO}MIPS_JALR, symbol
// tmplabel:
MCSymbol *TmpLabel = getContext().createTempSymbol();
const MCExpr *TmpExpr = MCSymbolRefExpr::create(TmpLabel, getContext());
const MCExpr *RelocJalrExpr =
MCSymbolRefExpr::create(JalSym, MCSymbolRefExpr::VK_None,
getContext(), IDLoc);
TOut.getStreamer().EmitRelocDirective(*TmpExpr,
inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
RelocJalrExpr, IDLoc, *STI);
TOut.getStreamer().EmitLabel(TmpLabel);
}
Inst = JalrInst;
ExpandedJalSym = true;
}
bool IsPCRelativeLoad = (MCID.TSFlags & MipsII::IsPCRelativeLoad) != 0;
if ((MCID.mayLoad() || MCID.mayStore()) && !IsPCRelativeLoad) {
// Check the offset of the memory operand; if it is a symbol
// reference or an immediate we may have to expand instructions.
for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
const MCOperandInfo &OpInfo = MCID.OpInfo[i];
if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
(OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
MCOperand &Op = Inst.getOperand(i);
if (Op.isImm()) {
int64_t MemOffset = Op.getImm();
if (MemOffset < -32768 || MemOffset > 32767) {
// Offset can't exceed a 16-bit value.
expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
} else if (Op.isExpr()) {
const MCExpr *Expr = Op.getExpr();
if (Expr->getKind() == MCExpr::SymbolRef) {
const MCSymbolRefExpr *SR =
static_cast<const MCSymbolRefExpr *>(Expr);
if (SR->getKind() == MCSymbolRefExpr::VK_None) {
// Expand symbol.
expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
} else if (!isEvaluated(Expr)) {
expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad());
return getParser().hasPendingError();
}
}
}
} // for
} // if load/store
if (inMicroMipsMode()) {
if (MCID.mayLoad() && Inst.getOpcode() != Mips::LWP_MM) {
// Try to create a 16-bit GP-relative load instruction.
for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
const MCOperandInfo &OpInfo = MCID.OpInfo[i];
if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
(OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
MCOperand &Op = Inst.getOperand(i);
if (Op.isImm()) {
int MemOffset = Op.getImm();
MCOperand &DstReg = Inst.getOperand(0);
MCOperand &BaseReg = Inst.getOperand(1);
if (isInt<9>(MemOffset) && (MemOffset % 4 == 0) &&
getContext().getRegisterInfo()->getRegClass(
Mips::GPRMM16RegClassID).contains(DstReg.getReg()) &&
(BaseReg.getReg() == Mips::GP ||
BaseReg.getReg() == Mips::GP_64)) {
TOut.emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset,
IDLoc, STI);
return false;
}
}
}
} // for
} // if load
// TODO: Handle this with the AsmOperandClass.PredicateMethod.
MCOperand Opnd;
int Imm;
switch (Inst.getOpcode()) {
default:
break;
case Mips::ADDIUSP_MM:
Opnd = Inst.getOperand(0);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (Imm < -1032 || Imm > 1028 || (Imm < 8 && Imm > -12) ||
Imm % 4 != 0)
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::SLL16_MM:
case Mips::SRL16_MM:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (Imm < 1 || Imm > 8)
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::LI16_MM:
Opnd = Inst.getOperand(1);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (Imm < -1 || Imm > 126)
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::ADDIUR2_MM:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (!(Imm == 1 || Imm == -1 ||
((Imm % 4 == 0) && Imm < 28 && Imm > 0)))
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::ANDI16_MM:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (!(Imm == 128 || (Imm >= 1 && Imm <= 4) || Imm == 7 || Imm == 8 ||
Imm == 15 || Imm == 16 || Imm == 31 || Imm == 32 || Imm == 63 ||
Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535))
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::LBU16_MM:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (Imm < -1 || Imm > 14)
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::SB16_MM:
case Mips::SB16_MMR6:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (Imm < 0 || Imm > 15)
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::LHU16_MM:
case Mips::SH16_MM:
case Mips::SH16_MMR6:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (Imm < 0 || Imm > 30 || (Imm % 2 != 0))
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::LW16_MM:
case Mips::SW16_MM:
case Mips::SW16_MMR6:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if (Imm < 0 || Imm > 60 || (Imm % 4 != 0))
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::ADDIUPC_MM:
Opnd = Inst.getOperand(1);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
Imm = Opnd.getImm();
if ((Imm % 4 != 0) || !isInt<25>(Imm))
return Error(IDLoc, "immediate operand value out of range");
break;
case Mips::LWP_MM:
case Mips::SWP_MM:
if (Inst.getOperand(0).getReg() == Mips::RA)
return Error(IDLoc, "invalid operand for instruction");
break;
case Mips::MOVEP_MM:
case Mips::MOVEP_MMR6: {
unsigned R0 = Inst.getOperand(0).getReg();
unsigned R1 = Inst.getOperand(1).getReg();
bool RegPair = ((R0 == Mips::A1 && R1 == Mips::A2) ||
(R0 == Mips::A1 && R1 == Mips::A3) ||
(R0 == Mips::A2 && R1 == Mips::A3) ||
(R0 == Mips::A0 && R1 == Mips::S5) ||
(R0 == Mips::A0 && R1 == Mips::S6) ||
(R0 == Mips::A0 && R1 == Mips::A1) ||
(R0 == Mips::A0 && R1 == Mips::A2) ||
(R0 == Mips::A0 && R1 == Mips::A3));
if (!RegPair)
return Error(IDLoc, "invalid operand for instruction");
break;
}
}
}
bool FillDelaySlot =
MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder();
if (FillDelaySlot)
TOut.emitDirectiveSetNoReorder();
MacroExpanderResultTy ExpandResult =
tryExpandInstruction(Inst, IDLoc, Out, STI);
switch (ExpandResult) {
case MER_NotAMacro:
Out.EmitInstruction(Inst, *STI);
break;
case MER_Success:
break;
case MER_Fail:
return true;
}
// We know we emitted an instruction on the MER_NotAMacro or MER_Success path.
// If we're in microMIPS mode then we must also set EF_MIPS_MICROMIPS.
if (inMicroMipsMode()) {
TOut.setUsesMicroMips();
TOut.updateABIInfo(*this);
}
// If this instruction has a delay slot and .set reorder is active,
// emit a NOP after it.
if (FillDelaySlot) {
TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst), IDLoc, STI);
TOut.emitDirectiveSetReorder();
}
if ((Inst.getOpcode() == Mips::JalOneReg ||
Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) &&
isPicAndNotNxxAbi()) {
if (IsCpRestoreSet) {
// We need a NOP between the JALR and the LW:
// If .set reorder has been used, we've already emitted a NOP.
// If .set noreorder has been used, we need to emit a NOP at this point.
if (!AssemblerOptions.back()->isReorder())
TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst), IDLoc,
STI);
// Load the $gp from the stack.
TOut.emitGPRestore(CpRestoreOffset, IDLoc, STI);
} else
Warning(IDLoc, "no .cprestore used in PIC mode");
}
return false;
}
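// Dispatch a macro or pseudo-instruction to its expansion helper. Ordinary
// instructions return MER_NotAMacro and are emitted unchanged by the caller.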
MipsAsmParser::MacroExpanderResultTy
MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
switch (Inst.getOpcode()) {
default:
return MER_NotAMacro;
case Mips::LoadImm32:
return expandLoadImm(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::LoadImm64:
return expandLoadImm(Inst, false, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::LoadAddrImm32:
case Mips::LoadAddrImm64:
assert(Inst.getOperand(0).isReg() && "expected register operand kind");
assert((Inst.getOperand(1).isImm() || Inst.getOperand(1).isExpr()) &&
"expected immediate operand kind");
return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister,
Inst.getOperand(1),
Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc,
Out, STI)
? MER_Fail
: MER_Success;
case Mips::LoadAddrReg32:
case Mips::LoadAddrReg64:
assert(Inst.getOperand(0).isReg() && "expected register operand kind");
assert(Inst.getOperand(1).isReg() && "expected register operand kind");
assert((Inst.getOperand(2).isImm() || Inst.getOperand(2).isExpr()) &&
"expected immediate operand kind");
return expandLoadAddress(Inst.getOperand(0).getReg(),
Inst.getOperand(1).getReg(), Inst.getOperand(2),
Inst.getOpcode() == Mips::LoadAddrReg32, IDLoc,
Out, STI)
? MER_Fail
: MER_Success;
case Mips::B_MM_Pseudo:
case Mips::B_MMR6_Pseudo:
return expandUncondBranchMMPseudo(Inst, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
case Mips::SWM_MM:
case Mips::LWM_MM:
return expandLoadStoreMultiple(Inst, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
case Mips::JalOneReg:
case Mips::JalTwoReg:
return expandJalWithRegs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::BneImm:
case Mips::BeqImm:
case Mips::BEQLImmMacro:
case Mips::BNELImmMacro:
return expandBranchImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::BLT:
case Mips::BLE:
case Mips::BGE:
case Mips::BGT:
case Mips::BLTU:
case Mips::BLEU:
case Mips::BGEU:
case Mips::BGTU:
case Mips::BLTL:
case Mips::BLEL:
case Mips::BGEL:
case Mips::BGTL:
case Mips::BLTUL:
case Mips::BLEUL:
case Mips::BGEUL:
case Mips::BGTUL:
case Mips::BLTImmMacro:
case Mips::BLEImmMacro:
case Mips::BGEImmMacro:
case Mips::BGTImmMacro:
case Mips::BLTUImmMacro:
case Mips::BLEUImmMacro:
case Mips::BGEUImmMacro:
case Mips::BGTUImmMacro:
case Mips::BLTLImmMacro:
case Mips::BLELImmMacro:
case Mips::BGELImmMacro:
case Mips::BGTLImmMacro:
case Mips::BLTULImmMacro:
case Mips::BLEULImmMacro:
case Mips::BGEULImmMacro:
case Mips::BGTULImmMacro:
return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SDivMacro:
case Mips::SDivIMacro:
case Mips::SRemMacro:
case Mips::SRemIMacro:
return expandDivRem(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
: MER_Success;
case Mips::DSDivMacro:
case Mips::DSDivIMacro:
case Mips::DSRemMacro:
case Mips::DSRemIMacro:
return expandDivRem(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
: MER_Success;
case Mips::UDivMacro:
case Mips::UDivIMacro:
case Mips::URemMacro:
case Mips::URemIMacro:
return expandDivRem(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
: MER_Success;
case Mips::DUDivMacro:
case Mips::DUDivIMacro:
case Mips::DURemMacro:
case Mips::DURemIMacro:
return expandDivRem(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
: MER_Success;
case Mips::PseudoTRUNC_W_S:
return expandTrunc(Inst, false, false, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
case Mips::PseudoTRUNC_W_D32:
return expandTrunc(Inst, true, false, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
case Mips::PseudoTRUNC_W_D:
return expandTrunc(Inst, true, true, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
case Mips::LoadImmSingleGPR:
return expandLoadImmReal(Inst, true, true, false, IDLoc, Out, STI)
? MER_Fail
: MER_Success;
case Mips::LoadImmSingleFGR:
return expandLoadImmReal(Inst, true, false, false, IDLoc, Out, STI)
? MER_Fail
: MER_Success;
case Mips::LoadImmDoubleGPR:
return expandLoadImmReal(Inst, false, true, false, IDLoc, Out, STI)
? MER_Fail
: MER_Success;
case Mips::LoadImmDoubleFGR:
return expandLoadImmReal(Inst, false, false, true, IDLoc, Out, STI)
? MER_Fail
: MER_Success;
case Mips::LoadImmDoubleFGR_32:
return expandLoadImmReal(Inst, false, false, false, IDLoc, Out, STI)
? MER_Fail
: MER_Success;
case Mips::Ulh:
return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::Ulhu:
return expandUlh(Inst, false, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::Ush:
return expandUsh(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::Ulw:
case Mips::Usw:
return expandUxw(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::NORImm:
case Mips::NORImm64:
return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SGE:
case Mips::SGEU:
return expandSge(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SGEImm:
case Mips::SGEUImm:
case Mips::SGEImm64:
case Mips::SGEUImm64:
return expandSgeImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SGTImm:
case Mips::SGTUImm:
case Mips::SGTImm64:
case Mips::SGTUImm64:
return expandSgtImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SLTImm64:
if (isInt<16>(Inst.getOperand(2).getImm())) {
Inst.setOpcode(Mips::SLTi64);
return MER_NotAMacro;
}
return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SLTUImm64:
if (isInt<16>(Inst.getOperand(2).getImm())) {
Inst.setOpcode(Mips::SLTiu64);
return MER_NotAMacro;
}
return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::ADDi: case Mips::ADDi_MM:
case Mips::ADDiu: case Mips::ADDiu_MM:
case Mips::SLTi: case Mips::SLTi_MM:
case Mips::SLTiu: case Mips::SLTiu_MM:
if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
int64_t ImmValue = Inst.getOperand(2).getImm();
if (isInt<16>(ImmValue))
return MER_NotAMacro;
return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
}
return MER_NotAMacro;
case Mips::ANDi: case Mips::ANDi_MM: case Mips::ANDi64:
case Mips::ORi: case Mips::ORi_MM: case Mips::ORi64:
case Mips::XORi: case Mips::XORi_MM: case Mips::XORi64:
if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
int64_t ImmValue = Inst.getOperand(2).getImm();
if (isUInt<16>(ImmValue))
return MER_NotAMacro;
return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
}
return MER_NotAMacro;
case Mips::ROL:
case Mips::ROR:
return expandRotation(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::ROLImm:
case Mips::RORImm:
return expandRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::DROL:
case Mips::DROR:
return expandDRotation(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::DROLImm:
case Mips::DRORImm:
return expandDRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::ABSMacro:
return expandAbs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::MULImmMacro:
case Mips::DMULImmMacro:
return expandMulImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::MULOMacro:
case Mips::DMULOMacro:
return expandMulO(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::MULOUMacro:
case Mips::DMULOUMacro:
return expandMulOU(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::DMULMacro:
return expandDMULMacro(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::LDMacro:
case Mips::SDMacro:
return expandLoadStoreDMacro(Inst, IDLoc, Out, STI,
Inst.getOpcode() == Mips::LDMacro)
? MER_Fail
: MER_Success;
case Mips::SDC1_M1:
return expandStoreDM1Macro(Inst, IDLoc, Out, STI)
? MER_Fail
: MER_Success;
case Mips::SEQMacro:
return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SEQIMacro:
return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::MFTC0: case Mips::MTTC0:
case Mips::MFTGPR: case Mips::MTTGPR:
case Mips::MFTLO: case Mips::MTTLO:
case Mips::MFTHI: case Mips::MTTHI:
case Mips::MFTACX: case Mips::MTTACX:
case Mips::MFTDSP: case Mips::MTTDSP:
case Mips::MFTC1: case Mips::MTTC1:
case Mips::MFTHC1: case Mips::MTTHC1:
case Mips::CFTC1: case Mips::CTTC1:
return expandMXTRAlias(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
}
}
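// Expand the register-form jal pseudos (JalOneReg/JalTwoReg) into the
// appropriate JALR variant, emitting a delay-slot NOP if .set reorder is
// active.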
bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
// Create a JALR instruction which is going to replace the pseudo-JAL.
MCInst JalrInst;
JalrInst.setLoc(IDLoc);
const MCOperand FirstRegOp = Inst.getOperand(0);
const unsigned Opcode = Inst.getOpcode();
if (Opcode == Mips::JalOneReg) {
// jal $rs => jalr $rs
if (IsCpRestoreSet && inMicroMipsMode()) {
JalrInst.setOpcode(Mips::JALRS16_MM);
JalrInst.addOperand(FirstRegOp);
} else if (inMicroMipsMode()) {
JalrInst.setOpcode(hasMips32r6() ? Mips::JALRC16_MMR6 : Mips::JALR16_MM);
JalrInst.addOperand(FirstRegOp);
} else {
JalrInst.setOpcode(Mips::JALR);
JalrInst.addOperand(MCOperand::createReg(Mips::RA));
JalrInst.addOperand(FirstRegOp);
}
} else if (Opcode == Mips::JalTwoReg) {
// jal $rd, $rs => jalr $rd, $rs
if (IsCpRestoreSet && inMicroMipsMode())
JalrInst.setOpcode(Mips::JALRS_MM);
else
JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
JalrInst.addOperand(FirstRegOp);
const MCOperand SecondRegOp = Inst.getOperand(1);
JalrInst.addOperand(SecondRegOp);
}
Out.EmitInstruction(JalrInst, *STI);
// If .set reorder is active and the branch instruction has a delay slot,
// emit a NOP after it.
const MCInstrDesc &MCID = getInstDesc(JalrInst.getOpcode());
if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
TOut.emitEmptyDelaySlot(hasShortDelaySlot(JalrInst), IDLoc,
STI);
return false;
}
/// Can the value be represented by an unsigned N-bit value and a shift left?
template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) {
unsigned BitNum = findFirstSet(x);
return (x == x >> BitNum << BitNum) && isUInt<N>(x >> BitNum);
}
/// Load (or add) an immediate into a register.
///
/// @param ImmValue The immediate to load.
/// @param DstReg The register that will hold the immediate.
/// @param SrcReg A register to add to the immediate or Mips::NoRegister
/// for a simple initialization.
/// @param Is32BitImm Is ImmValue 32-bit or 64-bit?
/// @param IsAddress True if the immediate represents an address. False if it
/// is an integer.
/// @param IDLoc Location of the immediate in the source file.
bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
unsigned SrcReg, bool Is32BitImm,
bool IsAddress, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
if (!Is32BitImm && !isGP64bit()) {
Error(IDLoc, "instruction requires a 64-bit architecture");
return true;
}
if (Is32BitImm) {
if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
// Sign extend up to 64-bit so that the predicates match the hardware
// behaviour. In particular, isInt<16>(0xffff8000) and similar should be
// true.
ImmValue = SignExtend64<32>(ImmValue);
} else {
Error(IDLoc, "instruction requires a 32-bit immediate");
return true;
}
}
unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg();
unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu;
bool UseSrcReg = false;
if (SrcReg != Mips::NoRegister)
UseSrcReg = true;
unsigned TmpReg = DstReg;
if (UseSrcReg &&
getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
// At this point we need AT to perform the expansions and we exit if it is
// not available.
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
TmpReg = ATReg;
}
if (isInt<16>(ImmValue)) {
if (!UseSrcReg)
SrcReg = ZeroReg;
// This doesn't quite follow the usual ABI expectations for N32 but matches
// traditional assembler behaviour. N32 would normally use addiu for both
// integers and addresses.
if (IsAddress && !Is32BitImm) {
TOut.emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI);
return false;
}
TOut.emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI);
return false;
}
if (isUInt<16>(ImmValue)) {
unsigned TmpReg = DstReg;
if (SrcReg == DstReg) {
TmpReg = getATReg(IDLoc);
if (!TmpReg)
return true;
}
TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
warnIfNoMacro(IDLoc);
uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
uint16_t Bits15To0 = ImmValue & 0xffff;
if (!Is32BitImm && !isInt<32>(ImmValue)) {
// Traditional behaviour seems to special case this particular value. It's
// not clear why other masks are handled differently.
if (ImmValue == 0xffffffff) {
TOut.emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, STI);
TOut.emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
// Expand to an ORi instead of a LUi to avoid sign-extending into the
// upper 32 bits.
TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, STI);
TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, STI);
if (Bits15To0)
TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
TOut.emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, STI);
if (Bits15To0)
TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
if (isShiftedUIntAtAnyPosition<16>(ImmValue)) {
if (Is32BitImm) {
Error(IDLoc, "instruction requires a 32-bit immediate");
return true;
}
// Traditionally, these immediates are shifted as little as possible and as
// such we align the most significant bit to bit 15 of our temporary.
unsigned FirstSet = findFirstSet((uint64_t)ImmValue);
unsigned LastSet = findLastSet((uint64_t)ImmValue);
unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet));
uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff;
TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, STI);
TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
warnIfNoMacro(IDLoc);
// The remaining case is packed with a sequence of dsll and ori, with zeros
// being omitted and any neighbouring dslls being coalesced.
// The highest 32 bits are equivalent to a 32-bit immediate load.
// Load bits 32-63 of ImmValue into bits 0-31 of the temporary register.
if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false,
IDLoc, Out, STI))
return false;
// Shift and accumulate into the register. If a 16-bit chunk is zero, then
// skip it and defer the shift to the next chunk.
unsigned ShiftCarriedForwards = 16;
for (int BitNum = 16; BitNum >= 0; BitNum -= 16) {
uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff;
if (ImmChunk != 0) {
TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI);
TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, STI);
ShiftCarriedForwards = 0;
}
ShiftCarriedForwards += 16;
}
ShiftCarriedForwards -= 16;
// Finish any remaining shifts left by trailing zeros.
if (ShiftCarriedForwards)
TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
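// Expand the LoadImm32/LoadImm64 pseudos by delegating to loadImmediate().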
bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI) {
const MCOperand &ImmOp = Inst.getOperand(1);
assert(ImmOp.isImm() && "expected immediate operand kind");
const MCOperand &DstRegOp = Inst.getOperand(0);
assert(DstRegOp.isReg() && "expected register operand kind");
if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister,
Is32BitImm, false, IDLoc, Out, STI))
return true;
return false;
}
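// Expand the la/dla load-address pseudos into either an immediate load or a
// symbol-address sequence.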
bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg,
const MCOperand &Offset,
bool Is32BitAddress, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
// la can't produce a usable address when addresses are 64-bit.
if (Is32BitAddress && ABI.ArePtrs64bit()) {
// FIXME: Demote this to a warning and continue as if we had 'dla' instead.
// We currently can't do this because we depend on the equality
// operator and N64 can end up with a GPR32/GPR64 mismatch.
Error(IDLoc, "la used to load 64-bit address");
// Continue as if we had 'dla' instead.
Is32BitAddress = false;
return true;
}
// dla requires 64-bit addresses.
if (!Is32BitAddress && !hasMips3()) {
Error(IDLoc, "instruction requires a 64-bit architecture");
return true;
}
if (!Offset.isImm())
return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg,
Is32BitAddress, IDLoc, Out, STI);
if (!ABI.ArePtrs64bit()) {
// Continue as if we had 'la' whether we had 'la' or 'dla'.
Is32BitAddress = true;
}
return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true,
IDLoc, Out, STI);
}
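// Load the address of a symbol into DstReg (optionally adding SrcReg), using
// GOT-relative sequences in PIC mode and %hi/%lo (or
// %highest/%higher/%hi/%lo for 64-bit) sequences otherwise.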
bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
unsigned DstReg, unsigned SrcReg,
bool Is32BitSym, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
// FIXME: These expansions do not respect -mxgot.
MipsTargetStreamer &TOut = getTargetStreamer();
bool UseSrcReg = SrcReg != Mips::NoRegister;
warnIfNoMacro(IDLoc);
if (inPicMode() && ABI.IsO32()) {
MCValue Res;
if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) {
Error(IDLoc, "expected relocatable expression");
return true;
}
if (Res.getSymB() != nullptr) {
Error(IDLoc, "expected relocatable expression with only one symbol");
return true;
}
// The case where the result register is $25 is somewhat special. If the
// symbol in the final relocation is external and not modified with a
// constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16.
if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
Res.getConstant() == 0 &&
!(Res.getSymA()->getSymbol().isInSection() ||
Res.getSymA()->getSymbol().isTemporary() ||
(Res.getSymA()->getSymbol().isELF() &&
cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
ELF::STB_LOCAL))) {
const MCExpr *CallExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
TOut.emitRRX(Mips::LW, DstReg, GPReg, MCOperand::createExpr(CallExpr),
IDLoc, STI);
return false;
}
// The remaining cases are:
// External GOT: lw $tmp, %got(symbol+offset)($gp)
// >addiu $tmp, $tmp, %lo(offset)
// >addu $rd, $tmp, $rs
// Local GOT: lw $tmp, %got(symbol+offset)($gp)
// addiu $tmp, $tmp, %lo(symbol+offset)
// >addu $rd, $tmp, $rs
// The instructions marked with a '>' may be omitted if they are redundant. If
// this happens then the last instruction must use $rd as the result
// register.
const MipsMCExpr *GotExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext());
const MCExpr *LoExpr = nullptr;
if (Res.getSymA()->getSymbol().isInSection() ||
Res.getSymA()->getSymbol().isTemporary())
LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
else if (Res.getConstant() != 0) {
// External symbols fully resolve the symbol with just the %got(symbol)
// but we must still account for any offset to the symbol for expressions
// like symbol+8.
LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
}
unsigned TmpReg = DstReg;
if (UseSrcReg &&
getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
SrcReg)) {
// If $rs is the same as $rd, we need to use AT.
// If it is not available we exit.
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
TmpReg = ATReg;
}
TOut.emitRRX(Mips::LW, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc,
STI);
if (LoExpr)
TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
if (inPicMode() && ABI.ArePtrs64bit()) {
MCValue Res;
if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) {
Error(IDLoc, "expected relocatable expression");
return true;
}
if (Res.getSymB() != nullptr) {
Error(IDLoc, "expected relocatable expression with only one symbol");
return true;
}
// The case where the result register is $25 is somewhat special. If the
// symbol in the final relocation is external and not modified with a
// constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP.
if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
Res.getConstant() == 0 &&
!(Res.getSymA()->getSymbol().isInSection() ||
Res.getSymA()->getSymbol().isTemporary() ||
(Res.getSymA()->getSymbol().isELF() &&
cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
ELF::STB_LOCAL))) {
const MCExpr *CallExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
TOut.emitRRX(Mips::LD, DstReg, GPReg, MCOperand::createExpr(CallExpr),
IDLoc, STI);
return false;
}
// The remaining cases are:
// Small offset: ld $tmp, %got_disp(symbol)($gp)
// >daddiu $tmp, $tmp, offset
// >daddu $rd, $tmp, $rs
// The instructions marked with a '>' may be omitted if they are redundant. If
// this happens then the last instruction must use $rd as the result
// register.
const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP,
Res.getSymA(),
getContext());
const MCExpr *LoExpr = nullptr;
if (Res.getConstant() != 0) {
// Symbols fully resolve with just the %got_disp(symbol) but we
// must still account for any offset to the symbol for
// expressions like symbol+8.
LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
// FIXME: Offsets greater than 16 bits are not yet implemented.
// FIXME: The correct range is a 32-bit sign-extended number.
if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) {
Error(IDLoc, "macro instruction uses large offset, which is not "
"currently supported");
return true;
}
}
unsigned TmpReg = DstReg;
if (UseSrcReg &&
getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
SrcReg)) {
// If $rs is the same as $rd, we need to use AT.
// If it is not available we exit.
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
TmpReg = ATReg;
}
TOut.emitRRX(Mips::LD, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc,
STI);
if (LoExpr)
TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
return false;
}
const MipsMCExpr *HiExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext());
const MipsMCExpr *LoExpr =
MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
// This is the 64-bit symbol address expansion.
if (ABI.ArePtrs64bit() && isGP64bit()) {
// We need AT for the 64-bit expansion in the cases where the optional
// source register is the destination register and for the superscalar
// scheduled form.
//
// If it is not available we exit if the destination is the same as the
// source register.
const MipsMCExpr *HighestExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
const MipsMCExpr *HigherExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
bool RdRegIsRsReg =
getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg);
if (canUseATReg() && UseSrcReg && RdRegIsRsReg) {
unsigned ATReg = getATReg(IDLoc);
// If $rs is the same as $rd:
// (d)la $rd, sym($rd) => lui $at, %highest(sym)
// daddiu $at, $at, %higher(sym)
// dsll $at, $at, 16
// daddiu $at, $at, %hi(sym)
// dsll $at, $at, 16
// daddiu $at, $at, %lo(sym)
// daddu $rd, $at, $rd
TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
STI);
TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
MCOperand::createExpr(HigherExpr), IDLoc, STI);
TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr),
IDLoc, STI);
TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
IDLoc, STI);
TOut.emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, STI);
return false;
} else if (canUseATReg() && !RdRegIsRsReg) {
unsigned ATReg = getATReg(IDLoc);
// If the $rs is different from $rd or if $rs isn't specified and we
// have $at available:
// (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
// lui $at, %hi(sym)
// daddiu $rd, $rd, %higher(sym)
// daddiu $at, $at, %lo(sym)
// dsll32 $rd, $rd, 0
// daddu $rd, $rd, $at
// (daddu $rd, $rd, $rs)
//
// Which is preferred for superscalar issue.
TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
STI);
TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
MCOperand::createExpr(HigherExpr), IDLoc, STI);
TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
IDLoc, STI);
TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
return false;
} else if (!canUseATReg() && !RdRegIsRsReg) {
// Otherwise, synthesize the address in the destination register
// serially:
// (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
// daddiu $rd, $rd, %higher(sym)
// dsll $rd, $rd, 16
// daddiu $rd, $rd, %hi(sym)
// dsll $rd, $rd, 16
// daddiu $rd, $rd, %lo(sym)
TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
STI);
TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
MCOperand::createExpr(HigherExpr), IDLoc, STI);
TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI);
TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
MCOperand::createExpr(HiExpr), IDLoc, STI);
TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI);
TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
MCOperand::createExpr(LoExpr), IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
return false;
} else {
// We have a case where SrcReg == DstReg and we don't have $at
// available. We can't expand this case, so error out appropriately.
assert(SrcReg == DstReg && !canUseATReg() &&
"Could have expanded dla but didn't?");
reportParseError(IDLoc,
"pseudo-instruction requires $at, which is not available");
return true;
}
}
// And now, the 32-bit symbol address expansion:
// If $rs is the same as $rd:
// (d)la $rd, sym($rd) => lui $at, %hi(sym)
// ori $at, $at, %lo(sym)
// addu $rd, $at, $rd
// Otherwise, if the $rs is different from $rd or if $rs isn't specified:
// (d)la $rd, sym/sym($rs) => lui $rd, %hi(sym)
// ori $rd, $rd, %lo(sym)
// (addu $rd, $rd, $rs)
unsigned TmpReg = DstReg;
if (UseSrcReg &&
getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
// If $rs is the same as $rd, we need to use AT.
// If it is not available we exit.
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
TmpReg = ATReg;
}
TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
IDLoc, STI);
if (UseSrcReg)
TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
else
assert(
getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, TmpReg));
return false;
}
// Each double-precision register D0-D15 overlaps with two of the
// single-precision registers F0-F31. As an example, all of the following hold
// true: D0 + 1 == F1, F1 + 1 == D1, F1 + 1 == F2, depending on the context.
static unsigned nextReg(unsigned Reg) {
if (MipsMCRegisterClasses[Mips::FGR32RegClassID].contains(Reg))
return Reg == (unsigned)Mips::F31 ? (unsigned)Mips::F0 : Reg + 1;
switch (Reg) {
default: llvm_unreachable("Unknown register in assembly macro expansion!");
case Mips::ZERO: return Mips::AT;
case Mips::AT: return Mips::V0;
case Mips::V0: return Mips::V1;
case Mips::V1: return Mips::A0;
case Mips::A0: return Mips::A1;
case Mips::A1: return Mips::A2;
case Mips::A2: return Mips::A3;
case Mips::A3: return Mips::T0;
case Mips::T0: return Mips::T1;
case Mips::T1: return Mips::T2;
case Mips::T2: return Mips::T3;
case Mips::T3: return Mips::T4;
case Mips::T4: return Mips::T5;
case Mips::T5: return Mips::T6;
case Mips::T6: return Mips::T7;
case Mips::T7: return Mips::S0;
case Mips::S0: return Mips::S1;
case Mips::S1: return Mips::S2;
case Mips::S2: return Mips::S3;
case Mips::S3: return Mips::S4;
case Mips::S4: return Mips::S5;
case Mips::S5: return Mips::S6;
case Mips::S6: return Mips::S7;
case Mips::S7: return Mips::T8;
case Mips::T8: return Mips::T9;
case Mips::T9: return Mips::K0;
case Mips::K0: return Mips::K1;
case Mips::K1: return Mips::GP;
case Mips::GP: return Mips::SP;
case Mips::SP: return Mips::FP;
case Mips::FP: return Mips::RA;
case Mips::RA: return Mips::ZERO;
case Mips::D0: return Mips::F1;
case Mips::D1: return Mips::F3;
case Mips::D2: return Mips::F5;
case Mips::D3: return Mips::F7;
case Mips::D4: return Mips::F9;
case Mips::D5: return Mips::F11;
case Mips::D6: return Mips::F13;
case Mips::D7: return Mips::F15;
case Mips::D8: return Mips::F17;
case Mips::D9: return Mips::F19;
case Mips::D10: return Mips::F21;
case Mips::D11: return Mips::F23;
case Mips::D12: return Mips::F25;
case Mips::D13: return Mips::F27;
case Mips::D14: return Mips::F29;
case Mips::D15: return Mips::F31;
}
}
// FIXME: This method is too general. In principle we should compute the number
// of instructions required to synthesize the immediate inline compared to
// synthesizing the address inline and relying on non-.text sections.
// For static O32 and N32 this may yield a small benefit; for static N64 it is
// likely to yield a much larger benefit, as we have to synthesize a 64-bit
// address to load a 64-bit value.
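// Emit the high/page part of Sym's address into $at, via the GOT in PIC mode
// or %hi (%highest/%higher/%hi for N64) otherwise; the caller supplies the
// %lo() part of the access.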
bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
MCSymbol *Sym) {
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
if(IsPicEnabled) {
const MCExpr *GotSym =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
const MipsMCExpr *GotExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT, GotSym, getContext());
if(isABI_O32() || isABI_N32()) {
TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr),
IDLoc, STI);
} else { //isABI_N64()
TOut.emitRRX(Mips::LD, ATReg, GPReg, MCOperand::createExpr(GotExpr),
IDLoc, STI);
}
} else { //!IsPicEnabled
const MCExpr *HiSym =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
const MipsMCExpr *HiExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HI, HiSym, getContext());
// FIXME: This is technically correct but gives a different result from gas,
// although gas is incomplete there (it has a fixme noting it doesn't work
// with 64-bit addresses).
// FIXME: With the -msym32 option, the address expansion for N64 should
// probably use the O32 / N32 case. It's safe to use the 64-bit address
// expansion as the symbol's value is considered sign-extended.
if(isABI_O32() || isABI_N32()) {
TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
} else { //isABI_N64()
const MCExpr *HighestSym =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
const MipsMCExpr *HighestExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, HighestSym, getContext());
const MCExpr *HigherSym =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
const MipsMCExpr *HigherExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, HigherSym, getContext());
TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
STI);
TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
MCOperand::createExpr(HigherExpr), IDLoc, STI);
TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr),
IDLoc, STI);
TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
}
}
return false;
}
bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR,
bool Is64FPU, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 2 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
"Invalid instruction operand.");
unsigned FirstReg = Inst.getOperand(0).getReg();
uint64_t ImmOp64 = Inst.getOperand(1).getImm();
uint32_t HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32;
// If ImmOp64 is of AsmToken::Integer type (all bits in the exponent field are
// zero), convert it to a double (e.g. 1 to 1.0).
if ((HiImmOp64 & 0x7ff00000) == 0) {
APFloat RealVal(APFloat::IEEEdouble(), ImmOp64);
ImmOp64 = RealVal.bitcastToAPInt().getZExtValue();
}
uint32_t LoImmOp64 = ImmOp64 & 0xffffffff;
HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32;
if (IsSingle) {
// Convert a double held in a uint64_t to a float held in a uint32_t,
// retaining the bit pattern of the float.
uint32_t ImmOp32;
double doubleImm = BitsToDouble(ImmOp64);
float tmp_float = static_cast<float>(doubleImm);
ImmOp32 = FloatToBits(tmp_float);
if (IsGPR) {
if (loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, true, IDLoc,
Out, STI))
return true;
return false;
} else {
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
if (LoImmOp64 == 0) {
if (loadImmediate(ImmOp32, ATReg, Mips::NoRegister, true, true, IDLoc,
Out, STI))
return true;
TOut.emitRR(Mips::MTC1, FirstReg, ATReg, IDLoc, STI);
return false;
}
MCSection *CS = getStreamer().getCurrentSectionOnly();
// FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
// where appropriate.
MCSection *ReadOnlySection = getContext().getELFSection(
".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
MCSymbol *Sym = getContext().createTempSymbol();
const MCExpr *LoSym =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
const MipsMCExpr *LoExpr =
MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
getStreamer().SwitchSection(ReadOnlySection);
getStreamer().EmitLabel(Sym, IDLoc);
getStreamer().EmitIntValue(ImmOp32, 4);
getStreamer().SwitchSection(CS);
if(emitPartialAddress(TOut, IDLoc, Sym))
return true;
TOut.emitRRX(Mips::LWC1, FirstReg, ATReg,
MCOperand::createExpr(LoExpr), IDLoc, STI);
}
return false;
}
// if(!IsSingle)
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
if (IsGPR) {
if (LoImmOp64 == 0) {
if(isABI_N32() || isABI_N64()) {
if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, false, true,
IDLoc, Out, STI))
return true;
return false;
} else {
if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, true, true,
IDLoc, Out, STI))
return true;
if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, true,
IDLoc, Out, STI))
return true;
return false;
}
}
MCSection *CS = getStreamer().getCurrentSectionOnly();
MCSection *ReadOnlySection = getContext().getELFSection(
".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
MCSymbol *Sym = getContext().createTempSymbol();
const MCExpr *LoSym =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
const MipsMCExpr *LoExpr =
MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
getStreamer().SwitchSection(ReadOnlySection);
getStreamer().EmitLabel(Sym, IDLoc);
getStreamer().EmitIntValue(HiImmOp64, 4);
getStreamer().EmitIntValue(LoImmOp64, 4);
getStreamer().SwitchSection(CS);
if(emitPartialAddress(TOut, IDLoc, Sym))
return true;
if(isABI_N64())
TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
MCOperand::createExpr(LoExpr), IDLoc, STI);
else
TOut.emitRRX(Mips::ADDiu, ATReg, ATReg,
MCOperand::createExpr(LoExpr), IDLoc, STI);
if(isABI_N32() || isABI_N64())
TOut.emitRRI(Mips::LD, FirstReg, ATReg, 0, IDLoc, STI);
else {
TOut.emitRRI(Mips::LW, FirstReg, ATReg, 0, IDLoc, STI);
TOut.emitRRI(Mips::LW, nextReg(FirstReg), ATReg, 4, IDLoc, STI);
}
return false;
} else { // if(!IsGPR && !IsSingle)
if ((LoImmOp64 == 0) &&
!((HiImmOp64 & 0xffff0000) && (HiImmOp64 & 0x0000ffff))) {
// FIXME: In the case where the constant is zero, we can load the
// register directly from the zero register.
if (loadImmediate(HiImmOp64, ATReg, Mips::NoRegister, true, true, IDLoc,
Out, STI))
return true;
if (isABI_N32() || isABI_N64())
TOut.emitRR(Mips::DMTC1, FirstReg, ATReg, IDLoc, STI);
else if (hasMips32r2()) {
TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, ATReg, IDLoc, STI);
} else {
TOut.emitRR(Mips::MTC1, nextReg(FirstReg), ATReg, IDLoc, STI);
TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
}
return false;
}
MCSection *CS = getStreamer().getCurrentSectionOnly();
// FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
// where appropriate.
MCSection *ReadOnlySection = getContext().getELFSection(
".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
MCSymbol *Sym = getContext().createTempSymbol();
const MCExpr *LoSym =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
const MipsMCExpr *LoExpr =
MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
getStreamer().SwitchSection(ReadOnlySection);
getStreamer().EmitLabel(Sym, IDLoc);
getStreamer().EmitIntValue(HiImmOp64, 4);
getStreamer().EmitIntValue(LoImmOp64, 4);
getStreamer().SwitchSection(CS);
if(emitPartialAddress(TOut, IDLoc, Sym))
return true;
TOut.emitRRX(Is64FPU ? Mips::LDC164 : Mips::LDC1, FirstReg, ATReg,
MCOperand::createExpr(LoExpr), IDLoc, STI);
}
return false;
}
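// Expand the microMIPS unconditional branch pseudo. Symbolic offsets and
// offsets that do not fit in 11 bits are rewritten as 'beq $zero, $zero,
// offset'; small immediate offsets use the 16-bit B16/BC16 encodings in
// microMIPS mode.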
bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(getInstDesc(Inst.getOpcode()).getNumOperands() == 1 &&
"unexpected number of operands");
MCOperand Offset = Inst.getOperand(0);
if (Offset.isExpr()) {
Inst.clear();
Inst.setOpcode(Mips::BEQ_MM);
Inst.addOperand(MCOperand::createReg(Mips::ZERO));
Inst.addOperand(MCOperand::createReg(Mips::ZERO));
Inst.addOperand(MCOperand::createExpr(Offset.getExpr()));
} else {
assert(Offset.isImm() && "expected immediate operand kind");
if (isInt<11>(Offset.getImm())) {
// If the offset fits into 11 bits, this instruction becomes the microMIPS
// 16-bit unconditional branch instruction.
if (inMicroMipsMode())
Inst.setOpcode(hasMips32r6() ? Mips::BC16_MMR6 : Mips::B16_MM);
} else {
if (!isInt<17>(Offset.getImm()))
return Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(), 1LL << 1))
return Error(IDLoc, "branch to misaligned address");
Inst.clear();
Inst.setOpcode(Mips::BEQ_MM);
Inst.addOperand(MCOperand::createReg(Mips::ZERO));
Inst.addOperand(MCOperand::createReg(Mips::ZERO));
Inst.addOperand(MCOperand::createImm(Offset.getImm()));
}
}
Out.EmitInstruction(Inst, *STI);
// If .set reorder is active and branch instruction has a delay slot,
// emit a NOP after it.
const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
TOut.emitEmptyDelaySlot(true, IDLoc, STI);
return false;
}
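// Expand the branch-against-immediate pseudos (bne/beq/bnel/beql with an
// immediate operand). A zero immediate is compared directly against $zero;
// any other value is first loaded into $at and the register form of the
// branch is emitted. Likely variants get an explicit delay-slot nop.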
bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
const MCOperand &DstRegOp = Inst.getOperand(0);
assert(DstRegOp.isReg() && "expected register operand kind");
const MCOperand &ImmOp = Inst.getOperand(1);
assert(ImmOp.isImm() && "expected immediate operand kind");
const MCOperand &MemOffsetOp = Inst.getOperand(2);
assert((MemOffsetOp.isImm() || MemOffsetOp.isExpr()) &&
"expected immediate or expression operand");
bool IsLikely = false;
unsigned OpCode = 0;
switch(Inst.getOpcode()) {
case Mips::BneImm:
OpCode = Mips::BNE;
break;
case Mips::BeqImm:
OpCode = Mips::BEQ;
break;
case Mips::BEQLImmMacro:
OpCode = Mips::BEQL;
IsLikely = true;
break;
case Mips::BNELImmMacro:
OpCode = Mips::BNEL;
IsLikely = true;
break;
default:
llvm_unreachable("Unknown immediate branch pseudo-instruction.");
break;
}
int64_t ImmValue = ImmOp.getImm();
if (ImmValue == 0) {
if (IsLikely) {
TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO,
MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
} else
TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
STI);
} else {
warnIfNoMacro(IDLoc);
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true,
IDLoc, Out, STI))
return true;
if (IsLikely) {
TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg,
MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
} else
TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
}
return false;
}
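// Expand a load or store whose offset does not fit into the signed 16-bit
// immediate field or is given symbolically. Large immediate offsets are split
// into a high part loaded into a temporary register (plus the base) and a low
// 16-bit displacement; symbolic offsets are expanded with %hi/%lo pairs, or
// through the GOT when assembling PIC code.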
void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, bool IsLoad) {
const MCOperand &DstRegOp = Inst.getOperand(0);
assert(DstRegOp.isReg() && "expected register operand kind");
const MCOperand &BaseRegOp = Inst.getOperand(1);
assert(BaseRegOp.isReg() && "expected register operand kind");
const MCOperand &OffsetOp = Inst.getOperand(2);
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned DstReg = DstRegOp.getReg();
unsigned BaseReg = BaseRegOp.getReg();
unsigned TmpReg = DstReg;
const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode());
int16_t DstRegClass = Desc.OpInfo[0].RegClass;
unsigned DstRegClassID =
getContext().getRegisterInfo()->getRegClass(DstRegClass).getID();
bool IsGPR = (DstRegClassID == Mips::GPR32RegClassID) ||
(DstRegClassID == Mips::GPR64RegClassID);
if (!IsLoad || !IsGPR || (BaseReg == DstReg)) {
// At this point we need AT to perform the expansions
// and we exit if it is not available.
TmpReg = getATReg(IDLoc);
if (!TmpReg)
return;
}
if (OffsetOp.isImm()) {
int64_t LoOffset = OffsetOp.getImm() & 0xffff;
int64_t HiOffset = OffsetOp.getImm() & ~0xffff;
// If the MSB of LoOffset is 1 (a negative number), we must increment HiOffset
// to account for the sign-extension of the low part.
if (LoOffset & 0x8000)
HiOffset += 0x10000;
bool IsLargeOffset = HiOffset != 0;
if (IsLargeOffset) {
bool Is32BitImm = (HiOffset >> 32) == 0;
if (loadImmediate(HiOffset, TmpReg, Mips::NoRegister, Is32BitImm, true,
IDLoc, Out, STI))
return;
}
if (BaseReg != Mips::ZERO && BaseReg != Mips::ZERO_64)
TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg,
BaseReg, IDLoc, STI);
TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, LoOffset, IDLoc, STI);
+ return;
+ }
+
+ assert(OffsetOp.isExpr() && "expected expression operand kind");
+ if (inPicMode()) {
+ // FIXME:
+ // a) Fix lw/sw $reg, symbol($reg) instruction expanding.
+ // b) If expression includes offset (sym + number), do not
+ // encode the offset into a relocation. Take it in account
+ // in the last load/store instruction.
+ // c) Check that immediates of R_MIPS_GOT16/R_MIPS_LO16 relocations
+ // do not exceed 16-bit.
+ // d) Use R_MIPS_GOT_PAGE/R_MIPS_GOT_OFST relocations instead
+ // of R_MIPS_GOT_DISP in appropriate cases to reduce number
+ // of GOT entries.
+ expandLoadAddress(TmpReg, Mips::NoRegister, OffsetOp, !ABI.ArePtrs64bit(),
+ IDLoc, Out, STI);
+ TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, 0, IDLoc, STI);
} else {
- assert(OffsetOp.isExpr() && "expected expression operand kind");
const MCExpr *ExprOffset = OffsetOp.getExpr();
MCOperand LoOperand = MCOperand::createExpr(
MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
MCOperand HiOperand = MCOperand::createExpr(
MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
if (IsLoad)
TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
LoOperand, TmpReg, IDLoc, STI);
else
TOut.emitStoreWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
LoOperand, TmpReg, IDLoc, STI);
}
}
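// Expand the microMIPS SWM/LWM pseudos, selecting the 16-bit SWM16/LWM16
// encodings when the register list is short, ends with $ra, the base register
// is $sp, and the offset lies in [0, 60].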
bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
unsigned OpNum = Inst.getNumOperands();
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM32_MM : Mips::LWM32_MM;
assert(Inst.getOperand(OpNum - 1).isImm() &&
Inst.getOperand(OpNum - 2).isReg() &&
Inst.getOperand(OpNum - 3).isReg() && "Invalid instruction operand.");
if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 &&
Inst.getOperand(OpNum - 1).getImm() >= 0 &&
(Inst.getOperand(OpNum - 2).getReg() == Mips::SP ||
Inst.getOperand(OpNum - 2).getReg() == Mips::SP_64) &&
(Inst.getOperand(OpNum - 3).getReg() == Mips::RA ||
Inst.getOperand(OpNum - 3).getReg() == Mips::RA_64)) {
// It can be implemented as a SWM16 or LWM16 instruction.
if (inMicroMipsMode() && hasMips32r6())
NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MMR6 : Mips::LWM16_MMR6;
else
NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM;
}
Inst.setOpcode(NewOpcode);
Out.EmitInstruction(Inst, *STI);
return false;
}
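// Expand the two-register conditional branch pseudos (blt, ble, bge, bgt and
// their unsigned, likely and immediate variants). When neither operand is
// $zero, an slt/sltu into $at followed by a beq/bne against $zero is emitted;
// comparisons involving $zero are folded into the corresponding
// compare-with-zero branches or elided entirely when the result is known.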
bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
bool EmittedNoMacroWarning = false;
unsigned PseudoOpcode = Inst.getOpcode();
unsigned SrcReg = Inst.getOperand(0).getReg();
const MCOperand &TrgOp = Inst.getOperand(1);
const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr();
unsigned ZeroSrcOpcode, ZeroTrgOpcode;
bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality;
unsigned TrgReg;
if (TrgOp.isReg())
TrgReg = TrgOp.getReg();
else if (TrgOp.isImm()) {
warnIfNoMacro(IDLoc);
EmittedNoMacroWarning = true;
TrgReg = getATReg(IDLoc);
if (!TrgReg)
return true;
switch(PseudoOpcode) {
default:
llvm_unreachable("unknown opcode for branch pseudo-instruction");
case Mips::BLTImmMacro:
PseudoOpcode = Mips::BLT;
break;
case Mips::BLEImmMacro:
PseudoOpcode = Mips::BLE;
break;
case Mips::BGEImmMacro:
PseudoOpcode = Mips::BGE;
break;
case Mips::BGTImmMacro:
PseudoOpcode = Mips::BGT;
break;
case Mips::BLTUImmMacro:
PseudoOpcode = Mips::BLTU;
break;
case Mips::BLEUImmMacro:
PseudoOpcode = Mips::BLEU;
break;
case Mips::BGEUImmMacro:
PseudoOpcode = Mips::BGEU;
break;
case Mips::BGTUImmMacro:
PseudoOpcode = Mips::BGTU;
break;
case Mips::BLTLImmMacro:
PseudoOpcode = Mips::BLTL;
break;
case Mips::BLELImmMacro:
PseudoOpcode = Mips::BLEL;
break;
case Mips::BGELImmMacro:
PseudoOpcode = Mips::BGEL;
break;
case Mips::BGTLImmMacro:
PseudoOpcode = Mips::BGTL;
break;
case Mips::BLTULImmMacro:
PseudoOpcode = Mips::BLTUL;
break;
case Mips::BLEULImmMacro:
PseudoOpcode = Mips::BLEUL;
break;
case Mips::BGEULImmMacro:
PseudoOpcode = Mips::BGEUL;
break;
case Mips::BGTULImmMacro:
PseudoOpcode = Mips::BGTUL;
break;
}
if (loadImmediate(TrgOp.getImm(), TrgReg, Mips::NoRegister, !isGP64bit(),
false, IDLoc, Out, STI))
return true;
}
switch (PseudoOpcode) {
case Mips::BLT:
case Mips::BLTU:
case Mips::BLTL:
case Mips::BLTUL:
AcceptsEquality = false;
ReverseOrderSLT = false;
IsUnsigned =
((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL));
IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL));
ZeroSrcOpcode = Mips::BGTZ;
ZeroTrgOpcode = Mips::BLTZ;
break;
case Mips::BLE:
case Mips::BLEU:
case Mips::BLEL:
case Mips::BLEUL:
AcceptsEquality = true;
ReverseOrderSLT = true;
IsUnsigned =
((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL));
IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL));
ZeroSrcOpcode = Mips::BGEZ;
ZeroTrgOpcode = Mips::BLEZ;
break;
case Mips::BGE:
case Mips::BGEU:
case Mips::BGEL:
case Mips::BGEUL:
AcceptsEquality = true;
ReverseOrderSLT = false;
IsUnsigned =
((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL));
IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL));
ZeroSrcOpcode = Mips::BLEZ;
ZeroTrgOpcode = Mips::BGEZ;
break;
case Mips::BGT:
case Mips::BGTU:
case Mips::BGTL:
case Mips::BGTUL:
AcceptsEquality = false;
ReverseOrderSLT = true;
IsUnsigned =
((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL));
IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL));
ZeroSrcOpcode = Mips::BLTZ;
ZeroTrgOpcode = Mips::BGTZ;
break;
default:
llvm_unreachable("unknown opcode for branch pseudo-instruction");
}
bool IsTrgRegZero = (TrgReg == Mips::ZERO);
bool IsSrcRegZero = (SrcReg == Mips::ZERO);
if (IsSrcRegZero && IsTrgRegZero) {
// FIXME: All of these Opcode-specific if's are needed for compatibility
// with GAS' behaviour. However, they may not generate the most efficient
// code in some circumstances.
if (PseudoOpcode == Mips::BLT) {
TOut.emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
IDLoc, STI);
return false;
}
if (PseudoOpcode == Mips::BLE) {
TOut.emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
IDLoc, STI);
Warning(IDLoc, "branch is always taken");
return false;
}
if (PseudoOpcode == Mips::BGE) {
TOut.emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
IDLoc, STI);
Warning(IDLoc, "branch is always taken");
return false;
}
if (PseudoOpcode == Mips::BGT) {
TOut.emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
IDLoc, STI);
return false;
}
if (PseudoOpcode == Mips::BGTU) {
TOut.emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO,
MCOperand::createExpr(OffsetExpr), IDLoc, STI);
return false;
}
if (AcceptsEquality) {
// If both registers are $0 and the pseudo-branch accepts equality, it
// will always be taken, so we emit an unconditional branch.
TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
MCOperand::createExpr(OffsetExpr), IDLoc, STI);
Warning(IDLoc, "branch is always taken");
return false;
}
// If both registers are $0 and the pseudo-branch does not accept
// equality, it will never be taken, so we don't have to emit anything.
return false;
}
if (IsSrcRegZero || IsTrgRegZero) {
if ((IsSrcRegZero && PseudoOpcode == Mips::BGTU) ||
(IsTrgRegZero && PseudoOpcode == Mips::BLTU)) {
// If the $rs is $0 and the pseudo-branch is BGTU (0 > x) or
// if the $rt is $0 and the pseudo-branch is BLTU (x < 0),
// the pseudo-branch will never be taken, so we don't emit anything.
// This only applies to unsigned pseudo-branches.
return false;
}
if ((IsSrcRegZero && PseudoOpcode == Mips::BLEU) ||
(IsTrgRegZero && PseudoOpcode == Mips::BGEU)) {
// If the $rs is $0 and the pseudo-branch is BLEU (0 <= x) or
// if the $rt is $0 and the pseudo-branch is BGEU (x >= 0),
// the pseudo-branch will always be taken, so we emit an unconditional
// branch.
// This only applies to unsigned pseudo-branches.
TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
MCOperand::createExpr(OffsetExpr), IDLoc, STI);
Warning(IDLoc, "branch is always taken");
return false;
}
if (IsUnsigned) {
// If the $rs is $0 and the pseudo-branch is BLTU (0 < x) or
// if the $rt is $0 and the pseudo-branch is BGTU (x > 0),
// the pseudo-branch will be taken only when the non-zero register is
// different from 0, so we emit a BNEZ.
//
// If the $rs is $0 and the pseudo-branch is BGEU (0 >= x) or
// if the $rt is $0 and the pseudo-branch is BLEU (x <= 0),
// the pseudo-branch will be taken only when the non-zero register is
// equal to 0, so we emit a BEQZ.
//
// Because only BLEU and BGEU branch on equality, we can use the
// AcceptsEquality variable to decide when to emit the BEQZ.
TOut.emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE,
IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO,
MCOperand::createExpr(OffsetExpr), IDLoc, STI);
return false;
}
// If we have a signed pseudo-branch and one of the registers is $0,
// we can use an appropriate compare-to-zero branch. We select which one
// to use in the switch statement above.
TOut.emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode,
IsSrcRegZero ? TrgReg : SrcReg,
MCOperand::createExpr(OffsetExpr), IDLoc, STI);
return false;
}
// If neither the SrcReg nor the TrgReg are $0, we need AT to perform the
// expansions. If it is not available, we return.
unsigned ATRegNum = getATReg(IDLoc);
if (!ATRegNum)
return true;
if (!EmittedNoMacroWarning)
warnIfNoMacro(IDLoc);
// SLT fits well with 2 of our 4 pseudo-branches:
// BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and
// BGT, where $rs > $rt, translates into "slt $at, $rt, $rs".
// If the result of the SLT is 1, we branch, and if it's 0, we don't.
// This is accomplished by using a BNEZ with the result of the SLT.
//
// The other 2 pseudo-branches are opposites of the above 2 (BGE with BLT
// and BLE with BGT), so we change the BNEZ into a BEQZ.
// Because only BGE and BLE branch on equality, we can use the
// AcceptsEquality variable to decide when to emit the BEQZ.
// Note that the order of the SLT arguments doesn't change between
// opposites.
//
// The same applies to the unsigned variants, except that SLTu is used
// instead of SLT.
TOut.emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum,
ReverseOrderSLT ? TrgReg : SrcReg,
ReverseOrderSLT ? SrcReg : TrgReg, IDLoc, STI);
TOut.emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL)
: (AcceptsEquality ? Mips::BEQ : Mips::BNE),
ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
STI);
return false;
}
// Expand an integer division macro.
//
// Notably we don't have to emit a warning when encountering $rt as the $zero
// register, or 0 as an immediate. processInstruction() has already done that.
//
// The destination register can only be $zero when expanding (S)DivIMacro or
// D(S)DivMacro.
bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, const bool IsMips64,
const bool Signed) {
MipsTargetStreamer &TOut = getTargetStreamer();
warnIfNoMacro(IDLoc);
const MCOperand &RdRegOp = Inst.getOperand(0);
assert(RdRegOp.isReg() && "expected register operand kind");
unsigned RdReg = RdRegOp.getReg();
const MCOperand &RsRegOp = Inst.getOperand(1);
assert(RsRegOp.isReg() && "expected register operand kind");
unsigned RsReg = RsRegOp.getReg();
unsigned RtReg;
int64_t ImmValue;
const MCOperand &RtOp = Inst.getOperand(2);
assert((RtOp.isReg() || RtOp.isImm()) &&
"expected register or immediate operand kind");
if (RtOp.isReg())
RtReg = RtOp.getReg();
else
ImmValue = RtOp.getImm();
unsigned DivOp;
unsigned ZeroReg;
unsigned SubOp;
if (IsMips64) {
DivOp = Signed ? Mips::DSDIV : Mips::DUDIV;
ZeroReg = Mips::ZERO_64;
SubOp = Mips::DSUB;
} else {
DivOp = Signed ? Mips::SDIV : Mips::UDIV;
ZeroReg = Mips::ZERO;
SubOp = Mips::SUB;
}
bool UseTraps = useTraps();
unsigned Opcode = Inst.getOpcode();
bool isDiv = Opcode == Mips::SDivMacro || Opcode == Mips::SDivIMacro ||
Opcode == Mips::UDivMacro || Opcode == Mips::UDivIMacro ||
Opcode == Mips::DSDivMacro || Opcode == Mips::DSDivIMacro ||
Opcode == Mips::DUDivMacro || Opcode == Mips::DUDivIMacro;
bool isRem = Opcode == Mips::SRemMacro || Opcode == Mips::SRemIMacro ||
Opcode == Mips::URemMacro || Opcode == Mips::URemIMacro ||
Opcode == Mips::DSRemMacro || Opcode == Mips::DSRemIMacro ||
Opcode == Mips::DURemMacro || Opcode == Mips::DURemIMacro;
if (RtOp.isImm()) {
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
if (ImmValue == 0) {
if (UseTraps)
TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
else
TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
return false;
}
if (isRem && (ImmValue == 1 || (Signed && (ImmValue == -1)))) {
TOut.emitRRR(Mips::OR, RdReg, ZeroReg, ZeroReg, IDLoc, STI);
return false;
} else if (isDiv && ImmValue == 1) {
TOut.emitRRR(Mips::OR, RdReg, RsReg, Mips::ZERO, IDLoc, STI);
return false;
} else if (isDiv && Signed && ImmValue == -1) {
TOut.emitRRR(SubOp, RdReg, ZeroReg, RsReg, IDLoc, STI);
return false;
} else {
if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue),
false, Inst.getLoc(), Out, STI))
return true;
TOut.emitRR(DivOp, RsReg, ATReg, IDLoc, STI);
TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
return true;
}
// If the macro expansion of (d)div(u) or (d)rem(u) would always trap or
// break, insert the trap/break and exit. This gives a different result to
// GAS. GAS has an inconsistency/missed optimization in that not all cases
// are handled equivalently. As the observed behaviour is the same, we're ok.
if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
if (UseTraps) {
TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
return false;
}
TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
return false;
}
// (d)rem(u) $0, $X, $Y is a special case. Like div $zero, $X, $Y, it does
// not expand to a macro sequence.
if (isRem && (RdReg == Mips::ZERO || RdReg == Mips::ZERO_64)) {
TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
return false;
}
// Temporary label for the first branch target.
MCContext &Context = TOut.getStreamer().getContext();
MCSymbol *BrTarget;
MCOperand LabelOp;
if (UseTraps) {
TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
} else {
// Branch to the li instruction.
BrTarget = Context.createTempSymbol();
LabelOp = MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
TOut.emitRRX(Mips::BNE, RtReg, ZeroReg, LabelOp, IDLoc, STI);
}
TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
if (!UseTraps)
TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
if (!Signed) {
if (!UseTraps)
TOut.getStreamer().EmitLabel(BrTarget);
TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
if (!UseTraps)
TOut.getStreamer().EmitLabel(BrTarget);
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI);
// Temporary label for the second branch target.
MCSymbol *BrTargetEnd = Context.createTempSymbol();
MCOperand LabelOpEnd =
MCOperand::createExpr(MCSymbolRefExpr::create(BrTargetEnd, Context));
// Branch to the mflo instruction.
TOut.emitRRX(Mips::BNE, RtReg, ATReg, LabelOpEnd, IDLoc, STI);
if (IsMips64) {
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
TOut.emitDSLL(ATReg, ATReg, 63, IDLoc, STI);
} else {
TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
}
if (UseTraps)
TOut.emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, STI);
else {
// Branch to the mflo instruction.
TOut.emitRRX(Mips::BNE, RsReg, ATReg, LabelOpEnd, IDLoc, STI);
TOut.emitNop(IDLoc, STI);
TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
}
TOut.getStreamer().EmitLabel(BrTargetEnd);
TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
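// Expand trunc.w.s / trunc.w.d. On MIPS-II and later this is a single
// TRUNC.W instruction; on MIPS-I the FCSR rounding mode is temporarily forced
// to round-toward-zero around a CVT.W and then restored.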
bool MipsAsmParser::expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU,
SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 3 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
Inst.getOperand(2).isReg() && "Invalid instruction operand.");
unsigned FirstReg = Inst.getOperand(0).getReg();
unsigned SecondReg = Inst.getOperand(1).getReg();
unsigned ThirdReg = Inst.getOperand(2).getReg();
if (hasMips1() && !hasMips2()) {
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI);
TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI);
TOut.emitNop(IDLoc, STI);
TOut.emitRRI(Mips::ORi, ATReg, ThirdReg, 0x3, IDLoc, STI);
TOut.emitRRI(Mips::XORi, ATReg, ATReg, 0x2, IDLoc, STI);
TOut.emitRR(Mips::CTC1, Mips::RA, ATReg, IDLoc, STI);
TOut.emitNop(IDLoc, STI);
TOut.emitRR(IsDouble ? (Is64FPU ? Mips::CVT_W_D64 : Mips::CVT_W_D32)
: Mips::CVT_W_S,
FirstReg, SecondReg, IDLoc, STI);
TOut.emitRR(Mips::CTC1, Mips::RA, ThirdReg, IDLoc, STI);
TOut.emitNop(IDLoc, STI);
return false;
}
TOut.emitRR(IsDouble ? (Is64FPU ? Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32)
: Mips::TRUNC_W_S,
FirstReg, SecondReg, IDLoc, STI);
return false;
}
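// Expand the unaligned halfword load pseudos (ulh/ulhu) into two byte loads
// combined with a shift and an OR. $at is always required, either to hold a
// large offset or to receive one of the loaded bytes.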
bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI) {
if (hasMips32r6() || hasMips64r6()) {
return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
}
const MCOperand &DstRegOp = Inst.getOperand(0);
assert(DstRegOp.isReg() && "expected register operand kind");
const MCOperand &SrcRegOp = Inst.getOperand(1);
assert(SrcRegOp.isReg() && "expected register operand kind");
const MCOperand &OffsetImmOp = Inst.getOperand(2);
assert(OffsetImmOp.isImm() && "expected immediate operand kind");
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned DstReg = DstRegOp.getReg();
unsigned SrcReg = SrcRegOp.getReg();
int64_t OffsetValue = OffsetImmOp.getImm();
// NOTE: We always need AT for ULHU, as it is always used as the source
// register for one of the LBu's.
warnIfNoMacro(IDLoc);
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
bool IsLargeOffset = !(isInt<16>(OffsetValue + 1) && isInt<16>(OffsetValue));
if (IsLargeOffset) {
if (loadImmediate(OffsetValue, ATReg, SrcReg, !ABI.ArePtrs64bit(), true,
IDLoc, Out, STI))
return true;
}
int64_t FirstOffset = IsLargeOffset ? 0 : OffsetValue;
int64_t SecondOffset = IsLargeOffset ? 1 : (OffsetValue + 1);
if (isLittle())
std::swap(FirstOffset, SecondOffset);
unsigned FirstLbuDstReg = IsLargeOffset ? DstReg : ATReg;
unsigned SecondLbuDstReg = IsLargeOffset ? ATReg : DstReg;
unsigned LbuSrcReg = IsLargeOffset ? ATReg : SrcReg;
unsigned SllReg = IsLargeOffset ? DstReg : ATReg;
TOut.emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg,
FirstOffset, IDLoc, STI);
TOut.emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondOffset, IDLoc, STI);
TOut.emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, STI);
TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI);
return false;
}
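// Expand the unaligned halfword store pseudo (ush) into two byte stores,
// using $at either for the shifted upper byte or, for large offsets, for the
// computed address.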
bool MipsAsmParser::expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
if (hasMips32r6() || hasMips64r6()) {
return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
}
const MCOperand &DstRegOp = Inst.getOperand(0);
assert(DstRegOp.isReg() && "expected register operand kind");
const MCOperand &SrcRegOp = Inst.getOperand(1);
assert(SrcRegOp.isReg() && "expected register operand kind");
const MCOperand &OffsetImmOp = Inst.getOperand(2);
assert(OffsetImmOp.isImm() && "expected immediate operand kind");
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned DstReg = DstRegOp.getReg();
unsigned SrcReg = SrcRegOp.getReg();
int64_t OffsetValue = OffsetImmOp.getImm();
warnIfNoMacro(IDLoc);
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
bool IsLargeOffset = !(isInt<16>(OffsetValue + 1) && isInt<16>(OffsetValue));
if (IsLargeOffset) {
if (loadImmediate(OffsetValue, ATReg, SrcReg, !ABI.ArePtrs64bit(), true,
IDLoc, Out, STI))
return true;
}
int64_t FirstOffset = IsLargeOffset ? 1 : (OffsetValue + 1);
int64_t SecondOffset = IsLargeOffset ? 0 : OffsetValue;
if (isLittle())
std::swap(FirstOffset, SecondOffset);
if (IsLargeOffset) {
TOut.emitRRI(Mips::SB, DstReg, ATReg, FirstOffset, IDLoc, STI);
TOut.emitRRI(Mips::SRL, DstReg, DstReg, 8, IDLoc, STI);
TOut.emitRRI(Mips::SB, DstReg, ATReg, SecondOffset, IDLoc, STI);
TOut.emitRRI(Mips::LBu, ATReg, ATReg, 0, IDLoc, STI);
TOut.emitRRI(Mips::SLL, DstReg, DstReg, 8, IDLoc, STI);
TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI);
} else {
TOut.emitRRI(Mips::SB, DstReg, SrcReg, FirstOffset, IDLoc, STI);
TOut.emitRRI(Mips::SRL, ATReg, DstReg, 8, IDLoc, STI);
TOut.emitRRI(Mips::SB, ATReg, SrcReg, SecondOffset, IDLoc, STI);
}
return false;
}
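// Expand the unaligned word pseudos (ulw/usw) into an LWL/LWR or SWL/SWR
// pair, materializing large offsets into $at and avoiding clobbering the base
// register when it is also the ulw destination.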
bool MipsAsmParser::expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
if (hasMips32r6() || hasMips64r6()) {
return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
}
const MCOperand &DstRegOp = Inst.getOperand(0);
assert(DstRegOp.isReg() && "expected register operand kind");
const MCOperand &SrcRegOp = Inst.getOperand(1);
assert(SrcRegOp.isReg() && "expected register operand kind");
const MCOperand &OffsetImmOp = Inst.getOperand(2);
assert(OffsetImmOp.isImm() && "expected immediate operand kind");
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned DstReg = DstRegOp.getReg();
unsigned SrcReg = SrcRegOp.getReg();
int64_t OffsetValue = OffsetImmOp.getImm();
// Compute left/right load/store offsets.
bool IsLargeOffset = !(isInt<16>(OffsetValue + 3) && isInt<16>(OffsetValue));
int64_t LxlOffset = IsLargeOffset ? 0 : OffsetValue;
int64_t LxrOffset = IsLargeOffset ? 3 : (OffsetValue + 3);
if (isLittle())
std::swap(LxlOffset, LxrOffset);
bool IsLoadInst = (Inst.getOpcode() == Mips::Ulw);
bool DoMove = IsLoadInst && (SrcReg == DstReg) && !IsLargeOffset;
unsigned TmpReg = SrcReg;
if (IsLargeOffset || DoMove) {
warnIfNoMacro(IDLoc);
TmpReg = getATReg(IDLoc);
if (!TmpReg)
return true;
}
if (IsLargeOffset) {
if (loadImmediate(OffsetValue, TmpReg, SrcReg, !ABI.ArePtrs64bit(), true,
IDLoc, Out, STI))
return true;
}
if (DoMove)
std::swap(DstReg, TmpReg);
unsigned XWL = IsLoadInst ? Mips::LWL : Mips::SWL;
unsigned XWR = IsLoadInst ? Mips::LWR : Mips::SWR;
TOut.emitRRI(XWL, DstReg, TmpReg, LxlOffset, IDLoc, STI);
TOut.emitRRI(XWR, DstReg, TmpReg, LxrOffset, IDLoc, STI);
if (DoMove)
TOut.emitRRR(Mips::OR, TmpReg, DstReg, Mips::ZERO, IDLoc, STI);
return false;
}
bool MipsAsmParser::expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 3 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() &&
Inst.getOperand(2).isReg() && "Invalid instruction operand.");
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
unsigned OpReg = Inst.getOperand(2).getReg();
unsigned OpCode;
warnIfNoMacro(IDLoc);
switch (Inst.getOpcode()) {
case Mips::SGE:
OpCode = Mips::SLT;
break;
case Mips::SGEU:
OpCode = Mips::SLTu;
break;
default:
llvm_unreachable("unexpected 'sge' opcode");
}
// $SrcReg >= $OpReg is equal to (not ($SrcReg < $OpReg))
TOut.emitRRR(OpCode, DstReg, SrcReg, OpReg, IDLoc, STI);
TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
return false;
}
bool MipsAsmParser::expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 3 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() &&
Inst.getOperand(2).isImm() && "Invalid instruction operand.");
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
int64_t ImmValue = Inst.getOperand(2).getImm();
unsigned OpRegCode, OpImmCode;
warnIfNoMacro(IDLoc);
switch (Inst.getOpcode()) {
case Mips::SGEImm:
case Mips::SGEImm64:
OpRegCode = Mips::SLT;
OpImmCode = Mips::SLTi;
break;
case Mips::SGEUImm:
case Mips::SGEUImm64:
OpRegCode = Mips::SLTu;
OpImmCode = Mips::SLTiu;
break;
default:
llvm_unreachable("unexpected 'sge' opcode with immediate");
}
// $SrcReg >= Imm is equal to (not ($SrcReg < Imm))
if (isInt<16>(ImmValue)) {
// Use the immediate version of SLT.
TOut.emitRRI(OpImmCode, DstReg, SrcReg, ImmValue, IDLoc, STI);
TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
} else {
unsigned ImmReg = DstReg;
if (DstReg == SrcReg) {
unsigned ATReg = getATReg(Inst.getLoc());
if (!ATReg)
return true;
ImmReg = ATReg;
}
if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
false, IDLoc, Out, STI))
return true;
TOut.emitRRR(OpRegCode, DstReg, SrcReg, ImmReg, IDLoc, STI);
TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
}
return false;
}
bool MipsAsmParser::expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 3 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() &&
Inst.getOperand(2).isImm() && "Invalid instruction operand.");
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
unsigned ImmReg = DstReg;
int64_t ImmValue = Inst.getOperand(2).getImm();
unsigned OpCode;
warnIfNoMacro(IDLoc);
switch (Inst.getOpcode()) {
case Mips::SGTImm:
case Mips::SGTImm64:
OpCode = Mips::SLT;
break;
case Mips::SGTUImm:
case Mips::SGTUImm64:
OpCode = Mips::SLTu;
break;
default:
llvm_unreachable("unexpected 'sgt' opcode with immediate");
}
if (DstReg == SrcReg) {
unsigned ATReg = getATReg(Inst.getLoc());
if (!ATReg)
return true;
ImmReg = ATReg;
}
if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
false, IDLoc, Out, STI))
return true;
// $SrcReg > $ImmReg is equal to $ImmReg < $SrcReg
TOut.emitRRR(OpCode, DstReg, ImmReg, SrcReg, IDLoc, STI);
return false;
}
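// Expand an immediate-form ALU alias by materializing the constant into the
// destination register (or $at when source and destination coincide) and
// emitting the corresponding three-register opcode.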
bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 3 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() &&
Inst.getOperand(2).isImm() && "Invalid instruction operand.");
unsigned ATReg = Mips::NoRegister;
unsigned FinalDstReg = Mips::NoRegister;
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
int64_t ImmValue = Inst.getOperand(2).getImm();
bool Is32Bit = isInt<32>(ImmValue) || (!isGP64bit() && isUInt<32>(ImmValue));
unsigned FinalOpcode = Inst.getOpcode();
if (DstReg == SrcReg) {
ATReg = getATReg(Inst.getLoc());
if (!ATReg)
return true;
FinalDstReg = DstReg;
DstReg = ATReg;
}
if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false,
Inst.getLoc(), Out, STI)) {
switch (FinalOpcode) {
default:
llvm_unreachable("unimplemented expansion");
case Mips::ADDi:
FinalOpcode = Mips::ADD;
break;
case Mips::ADDiu:
FinalOpcode = Mips::ADDu;
break;
case Mips::ANDi:
FinalOpcode = Mips::AND;
break;
case Mips::NORImm:
FinalOpcode = Mips::NOR;
break;
case Mips::ORi:
FinalOpcode = Mips::OR;
break;
case Mips::SLTi:
FinalOpcode = Mips::SLT;
break;
case Mips::SLTiu:
FinalOpcode = Mips::SLTu;
break;
case Mips::XORi:
FinalOpcode = Mips::XOR;
break;
case Mips::ADDi_MM:
FinalOpcode = Mips::ADD_MM;
break;
case Mips::ADDiu_MM:
FinalOpcode = Mips::ADDu_MM;
break;
case Mips::ANDi_MM:
FinalOpcode = Mips::AND_MM;
break;
case Mips::ORi_MM:
FinalOpcode = Mips::OR_MM;
break;
case Mips::SLTi_MM:
FinalOpcode = Mips::SLT_MM;
break;
case Mips::SLTiu_MM:
FinalOpcode = Mips::SLTu_MM;
break;
case Mips::XORi_MM:
FinalOpcode = Mips::XOR_MM;
break;
case Mips::ANDi64:
FinalOpcode = Mips::AND64;
break;
case Mips::NORImm64:
FinalOpcode = Mips::NOR64;
break;
case Mips::ORi64:
FinalOpcode = Mips::OR64;
break;
case Mips::SLTImm64:
FinalOpcode = Mips::SLT64;
break;
case Mips::SLTUImm64:
FinalOpcode = Mips::SLTu64;
break;
case Mips::XORi64:
FinalOpcode = Mips::XOR64;
break;
}
if (FinalDstReg == Mips::NoRegister)
TOut.emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, STI);
else
TOut.emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc, STI);
return false;
}
return true;
}
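// Expand rol/ror with a register shift amount. On MIPS32r2 this uses ROTRV,
// negating the amount for rol; on older ISAs it is synthesized from a pair of
// variable shifts OR'd together.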
bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DReg = Inst.getOperand(0).getReg();
unsigned SReg = Inst.getOperand(1).getReg();
unsigned TReg = Inst.getOperand(2).getReg();
unsigned TmpReg = DReg;
unsigned FirstShift = Mips::NOP;
unsigned SecondShift = Mips::NOP;
if (hasMips32r2()) {
if (DReg == SReg) {
TmpReg = getATReg(Inst.getLoc());
if (!TmpReg)
return true;
}
if (Inst.getOpcode() == Mips::ROL) {
TOut.emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
TOut.emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI);
return false;
}
if (Inst.getOpcode() == Mips::ROR) {
TOut.emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), STI);
return false;
}
return true;
}
if (hasMips32()) {
switch (Inst.getOpcode()) {
default:
llvm_unreachable("unexpected instruction opcode");
case Mips::ROL:
FirstShift = Mips::SRLV;
SecondShift = Mips::SLLV;
break;
case Mips::ROR:
FirstShift = Mips::SLLV;
SecondShift = Mips::SRLV;
break;
}
ATReg = getATReg(Inst.getLoc());
if (!ATReg)
return true;
TOut.emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI);
TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI);
TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
return false;
}
return true;
}
bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DReg = Inst.getOperand(0).getReg();
unsigned SReg = Inst.getOperand(1).getReg();
int64_t ImmValue = Inst.getOperand(2).getImm();
unsigned FirstShift = Mips::NOP;
unsigned SecondShift = Mips::NOP;
if (hasMips32r2()) {
if (Inst.getOpcode() == Mips::ROLImm) {
uint64_t MaxShift = 32;
uint64_t ShiftValue = ImmValue;
if (ImmValue != 0)
ShiftValue = MaxShift - ImmValue;
TOut.emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), STI);
return false;
}
if (Inst.getOpcode() == Mips::RORImm) {
TOut.emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), STI);
return false;
}
return true;
}
if (hasMips32()) {
if (ImmValue == 0) {
TOut.emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), STI);
return false;
}
switch (Inst.getOpcode()) {
default:
llvm_unreachable("unexpected instruction opcode");
case Mips::ROLImm:
FirstShift = Mips::SLL;
SecondShift = Mips::SRL;
break;
case Mips::RORImm:
FirstShift = Mips::SRL;
SecondShift = Mips::SLL;
break;
}
ATReg = getATReg(Inst.getLoc());
if (!ATReg)
return true;
TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), STI);
TOut.emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), STI);
TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
return false;
}
return true;
}
bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DReg = Inst.getOperand(0).getReg();
unsigned SReg = Inst.getOperand(1).getReg();
unsigned TReg = Inst.getOperand(2).getReg();
unsigned TmpReg = DReg;
unsigned FirstShift = Mips::NOP;
unsigned SecondShift = Mips::NOP;
if (hasMips64r2()) {
if (TmpReg == SReg) {
TmpReg = getATReg(Inst.getLoc());
if (!TmpReg)
return true;
}
if (Inst.getOpcode() == Mips::DROL) {
TOut.emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
TOut.emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI);
return false;
}
if (Inst.getOpcode() == Mips::DROR) {
TOut.emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), STI);
return false;
}
return true;
}
if (hasMips64()) {
switch (Inst.getOpcode()) {
default:
llvm_unreachable("unexpected instruction opcode");
case Mips::DROL:
FirstShift = Mips::DSRLV;
SecondShift = Mips::DSLLV;
break;
case Mips::DROR:
FirstShift = Mips::DSLLV;
SecondShift = Mips::DSRLV;
break;
}
ATReg = getATReg(Inst.getLoc());
if (!ATReg)
return true;
TOut.emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI);
TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI);
TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
return false;
}
return true;
}
bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DReg = Inst.getOperand(0).getReg();
unsigned SReg = Inst.getOperand(1).getReg();
int64_t ImmValue = Inst.getOperand(2).getImm() % 64;
unsigned FirstShift = Mips::NOP;
unsigned SecondShift = Mips::NOP;
MCInst TmpInst;
if (hasMips64r2()) {
unsigned FinalOpcode = Mips::NOP;
if (ImmValue == 0)
FinalOpcode = Mips::DROTR;
else if (ImmValue % 32 == 0)
FinalOpcode = Mips::DROTR32;
else if ((ImmValue >= 1) && (ImmValue <= 32)) {
if (Inst.getOpcode() == Mips::DROLImm)
FinalOpcode = Mips::DROTR32;
else
FinalOpcode = Mips::DROTR;
} else if (ImmValue >= 33) {
if (Inst.getOpcode() == Mips::DROLImm)
FinalOpcode = Mips::DROTR;
else
FinalOpcode = Mips::DROTR32;
}
uint64_t ShiftValue = ImmValue % 32;
if (Inst.getOpcode() == Mips::DROLImm)
ShiftValue = (32 - ImmValue % 32) % 32;
TOut.emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), STI);
return false;
}
if (hasMips64()) {
if (ImmValue == 0) {
TOut.emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), STI);
return false;
}
switch (Inst.getOpcode()) {
default:
llvm_unreachable("unexpected instruction opcode");
case Mips::DROLImm:
if ((ImmValue >= 1) && (ImmValue <= 31)) {
FirstShift = Mips::DSLL;
SecondShift = Mips::DSRL32;
}
if (ImmValue == 32) {
FirstShift = Mips::DSLL32;
SecondShift = Mips::DSRL32;
}
if ((ImmValue >= 33) && (ImmValue <= 63)) {
FirstShift = Mips::DSLL32;
SecondShift = Mips::DSRL;
}
break;
case Mips::DRORImm:
if ((ImmValue >= 1) && (ImmValue <= 31)) {
FirstShift = Mips::DSRL;
SecondShift = Mips::DSLL32;
}
if (ImmValue == 32) {
FirstShift = Mips::DSRL32;
SecondShift = Mips::DSLL32;
}
if ((ImmValue >= 33) && (ImmValue <= 63)) {
FirstShift = Mips::DSRL32;
SecondShift = Mips::DSLL;
}
break;
}
ATReg = getATReg(Inst.getLoc());
if (!ATReg)
return true;
TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), STI);
TOut.emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32,
Inst.getLoc(), STI);
TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
return false;
}
return true;
}
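// Expand 'abs': branch over the negation with BGEZ when the source is
// already non-negative, filling the delay slot with a move (or a nop when the
// source and destination registers are the same).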
bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned FirstRegOp = Inst.getOperand(0).getReg();
unsigned SecondRegOp = Inst.getOperand(1).getReg();
TOut.emitRI(Mips::BGEZ, SecondRegOp, 8, IDLoc, STI);
if (FirstRegOp != SecondRegOp)
TOut.emitRRR(Mips::ADDu, FirstRegOp, SecondRegOp, Mips::ZERO, IDLoc, STI);
else
TOut.emitEmptyDelaySlot(false, IDLoc, STI);
TOut.emitRRR(Mips::SUB, FirstRegOp, Mips::ZERO, SecondRegOp, IDLoc, STI);
return false;
}
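// Expand 'mul'/'dmul' with an immediate operand: load the constant into $at,
// multiply with MULT/DMULT and move the low half of the result from LO.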
bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
int32_t ImmValue = Inst.getOperand(2).getImm();
ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out,
STI);
TOut.emitRR(Inst.getOpcode() == Mips::MULImmMacro ? Mips::MULT : Mips::DMULT,
SrcReg, ATReg, IDLoc, STI);
TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
return false;
}
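// Expand 'mulo'/'dmulo' (signed multiply with overflow check): the HI
// register must match the sign-extension of the LO result, otherwise a trap
// or break is raised.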
bool MipsAsmParser::expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
unsigned TmpReg = Inst.getOperand(2).getReg();
ATReg = getATReg(Inst.getLoc());
if (!ATReg)
return true;
TOut.emitRR(Inst.getOpcode() == Mips::MULOMacro ? Mips::MULT : Mips::DMULT,
SrcReg, TmpReg, IDLoc, STI);
TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
TOut.emitRRI(Inst.getOpcode() == Mips::MULOMacro ? Mips::SRA : Mips::DSRA32,
DstReg, DstReg, 0x1F, IDLoc, STI);
TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI);
if (useTraps()) {
TOut.emitRRI(Mips::TNE, DstReg, ATReg, 6, IDLoc, STI);
} else {
MCContext & Context = TOut.getStreamer().getContext();
MCSymbol * BrTarget = Context.createTempSymbol();
MCOperand LabelOp =
MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
TOut.emitRRX(Mips::BEQ, DstReg, ATReg, LabelOp, IDLoc, STI);
if (AssemblerOptions.back()->isReorder())
TOut.emitNop(IDLoc, STI);
TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
TOut.getStreamer().EmitLabel(BrTarget);
}
TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
return false;
}
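// Expand 'mulou'/'dmulou' (unsigned multiply with overflow check): any
// nonzero bits left in HI raise a trap or break.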
bool MipsAsmParser::expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned ATReg = Mips::NoRegister;
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
unsigned TmpReg = Inst.getOperand(2).getReg();
ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
TOut.emitRR(Inst.getOpcode() == Mips::MULOUMacro ? Mips::MULTu : Mips::DMULTu,
SrcReg, TmpReg, IDLoc, STI);
TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI);
TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
if (useTraps()) {
TOut.emitRRI(Mips::TNE, ATReg, Mips::ZERO, 6, IDLoc, STI);
} else {
MCContext & Context = TOut.getStreamer().getContext();
MCSymbol * BrTarget = Context.createTempSymbol();
MCOperand LabelOp =
MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
TOut.emitRRX(Mips::BEQ, ATReg, Mips::ZERO, LabelOp, IDLoc, STI);
if (AssemblerOptions.back()->isReorder())
TOut.emitNop(IDLoc, STI);
TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
TOut.getStreamer().EmitLabel(BrTarget);
}
return false;
}
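// Expand the 'dmul' macro into DMULTU followed by MFLO.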
bool MipsAsmParser::expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
unsigned TmpReg = Inst.getOperand(2).getReg();
TOut.emitRR(Mips::DMULTu, SrcReg, TmpReg, IDLoc, STI);
TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
return false;
}
// Expand 'ld $<reg> offset($reg2)' to 'lw $<reg>, offset($reg2);
// lw $<reg+1>, offset+4($reg2)'
// or expand 'sd $<reg> offset($reg2)' to 'sw $<reg>, offset($reg2);
// sw $<reg+1>, offset+4($reg2)'
// for O32.
bool MipsAsmParser::expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI,
bool IsLoad) {
if (!isABI_O32())
return true;
warnIfNoMacro(IDLoc);
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned Opcode = IsLoad ? Mips::LW : Mips::SW;
unsigned FirstReg = Inst.getOperand(0).getReg();
unsigned SecondReg = nextReg(FirstReg);
unsigned BaseReg = Inst.getOperand(1).getReg();
if (!SecondReg)
return true;
warnIfRegIndexIsAT(FirstReg, IDLoc);
assert(Inst.getOperand(2).isImm() &&
"Offset for load macro is not immediate!");
MCOperand &FirstOffset = Inst.getOperand(2);
signed NextOffset = FirstOffset.getImm() + 4;
MCOperand SecondOffset = MCOperand::createImm(NextOffset);
if (!isInt<16>(FirstOffset.getImm()) || !isInt<16>(NextOffset))
return true;
// For loads, clobber the base register with the second load instead of the
// first if BaseReg == FirstReg.
if (FirstReg != BaseReg || !IsLoad) {
TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI);
TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI);
} else {
TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI);
TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI);
}
return false;
}
// Expand 's.d $<reg> offset($reg2)' to 'swc1 $<reg+1>, offset($reg2);
// swc1 $<reg>, offset+4($reg2)'
// or if little endian to 'swc1 $<reg>, offset($reg2);
// swc1 $<reg+1>, offset+4($reg2)'
// for Mips1.
bool MipsAsmParser::expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
if (!isABI_O32())
return true;
warnIfNoMacro(IDLoc);
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned Opcode = Mips::SWC1;
unsigned FirstReg = Inst.getOperand(0).getReg();
unsigned SecondReg = nextReg(FirstReg);
unsigned BaseReg = Inst.getOperand(1).getReg();
if (!SecondReg)
return true;
warnIfRegIndexIsAT(FirstReg, IDLoc);
assert(Inst.getOperand(2).isImm() &&
"Offset for macro is not immediate!");
MCOperand &FirstOffset = Inst.getOperand(2);
signed NextOffset = FirstOffset.getImm() + 4;
MCOperand SecondOffset = MCOperand::createImm(NextOffset);
if (!isInt<16>(FirstOffset.getImm()) || !isInt<16>(NextOffset))
return true;
if (!IsLittleEndian)
std::swap(FirstReg, SecondReg);
TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI);
TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI);
return false;
}
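// Expand 'seq' (set on equal): XOR the two operands and test the result
// against 1 with SLTiu; when one operand is $zero the XOR is skipped.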
bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 3 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() &&
Inst.getOperand(2).isReg() && "Invalid instruction operand.");
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
unsigned OpReg = Inst.getOperand(2).getReg();
warnIfNoMacro(IDLoc);
if (SrcReg != Mips::ZERO && OpReg != Mips::ZERO) {
TOut.emitRRR(Mips::XOR, DstReg, SrcReg, OpReg, IDLoc, STI);
TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI);
return false;
}
unsigned Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg;
TOut.emitRRI(Mips::SLTiu, DstReg, Reg, 1, IDLoc, STI);
return false;
}
bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 3 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() &&
Inst.getOperand(2).isImm() && "Invalid instruction operand.");
unsigned DstReg = Inst.getOperand(0).getReg();
unsigned SrcReg = Inst.getOperand(1).getReg();
int64_t Imm = Inst.getOperand(2).getImm();
warnIfNoMacro(IDLoc);
if (Imm == 0) {
TOut.emitRRI(Mips::SLTiu, DstReg, SrcReg, 1, IDLoc, STI);
return false;
}
if (SrcReg == Mips::ZERO) {
Warning(IDLoc, "comparison is always false");
TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu,
DstReg, SrcReg, SrcReg, IDLoc, STI);
return false;
}
unsigned Opc;
if (Imm > -0x8000 && Imm < 0) {
Imm = -Imm;
Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu;
} else {
Opc = Mips::XORi;
}
if (!isUInt<16>(Imm)) {
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
if (loadImmediate(Imm, ATReg, Mips::NoRegister, true, isGP64bit(), IDLoc,
Out, STI))
return true;
TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI);
TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI);
return false;
}
TOut.emitRRI(Opc, DstReg, SrcReg, Imm, IDLoc, STI);
TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI);
return false;
}
// Map the DSP accumulator and control register to the corresponding gpr
// operand. Unlike the other aliases, the m(f|t)t(lo|hi|acx) instructions
// do not map the DSP registers contiguously to gpr registers.
static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) {
switch (Inst.getOpcode()) {
case Mips::MFTLO:
case Mips::MTTLO:
switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) {
case Mips::AC0:
return Mips::ZERO;
case Mips::AC1:
return Mips::A0;
case Mips::AC2:
return Mips::T0;
case Mips::AC3:
return Mips::T4;
default:
llvm_unreachable("Unknown register for 'mttr' alias!");
}
case Mips::MFTHI:
case Mips::MTTHI:
switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) {
case Mips::AC0:
return Mips::AT;
case Mips::AC1:
return Mips::A1;
case Mips::AC2:
return Mips::T1;
case Mips::AC3:
return Mips::T5;
default:
llvm_unreachable("Unknown register for 'mttr' alias!");
}
case Mips::MFTACX:
case Mips::MTTACX:
switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) {
case Mips::AC0:
return Mips::V0;
case Mips::AC1:
return Mips::A2;
case Mips::AC2:
return Mips::T2;
case Mips::AC3:
return Mips::T6;
default:
llvm_unreachable("Unknown register for 'mttr' alias!");
}
case Mips::MFTDSP:
case Mips::MTTDSP:
return Mips::S0;
default:
llvm_unreachable("Unknown instruction for 'mttr' dsp alias!");
}
}
// Map the floating point register operand to the corresponding register
// operand.
static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) {
switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) {
case Mips::F0: return Mips::ZERO;
case Mips::F1: return Mips::AT;
case Mips::F2: return Mips::V0;
case Mips::F3: return Mips::V1;
case Mips::F4: return Mips::A0;
case Mips::F5: return Mips::A1;
case Mips::F6: return Mips::A2;
case Mips::F7: return Mips::A3;
case Mips::F8: return Mips::T0;
case Mips::F9: return Mips::T1;
case Mips::F10: return Mips::T2;
case Mips::F11: return Mips::T3;
case Mips::F12: return Mips::T4;
case Mips::F13: return Mips::T5;
case Mips::F14: return Mips::T6;
case Mips::F15: return Mips::T7;
case Mips::F16: return Mips::S0;
case Mips::F17: return Mips::S1;
case Mips::F18: return Mips::S2;
case Mips::F19: return Mips::S3;
case Mips::F20: return Mips::S4;
case Mips::F21: return Mips::S5;
case Mips::F22: return Mips::S6;
case Mips::F23: return Mips::S7;
case Mips::F24: return Mips::T8;
case Mips::F25: return Mips::T9;
case Mips::F26: return Mips::K0;
case Mips::F27: return Mips::K1;
case Mips::F28: return Mips::GP;
case Mips::F29: return Mips::SP;
case Mips::F30: return Mips::FP;
case Mips::F31: return Mips::RA;
default: llvm_unreachable("Unknown register for mttc1 alias!");
}
}
// Map the coprocessor operand to the corresponding gpr register operand.
static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) {
switch (Inst.getOperand(IsMFTC0 ? 1 : 0).getReg()) {
case Mips::COP00: return Mips::ZERO;
case Mips::COP01: return Mips::AT;
case Mips::COP02: return Mips::V0;
case Mips::COP03: return Mips::V1;
case Mips::COP04: return Mips::A0;
case Mips::COP05: return Mips::A1;
case Mips::COP06: return Mips::A2;
case Mips::COP07: return Mips::A3;
case Mips::COP08: return Mips::T0;
case Mips::COP09: return Mips::T1;
case Mips::COP010: return Mips::T2;
case Mips::COP011: return Mips::T3;
case Mips::COP012: return Mips::T4;
case Mips::COP013: return Mips::T5;
case Mips::COP014: return Mips::T6;
case Mips::COP015: return Mips::T7;
case Mips::COP016: return Mips::S0;
case Mips::COP017: return Mips::S1;
case Mips::COP018: return Mips::S2;
case Mips::COP019: return Mips::S3;
case Mips::COP020: return Mips::S4;
case Mips::COP021: return Mips::S5;
case Mips::COP022: return Mips::S6;
case Mips::COP023: return Mips::S7;
case Mips::COP024: return Mips::T8;
case Mips::COP025: return Mips::T9;
case Mips::COP026: return Mips::K0;
case Mips::COP027: return Mips::K1;
case Mips::COP028: return Mips::GP;
case Mips::COP029: return Mips::SP;
case Mips::COP030: return Mips::FP;
case Mips::COP031: return Mips::RA;
default: llvm_unreachable("Unknown register for mttc0 alias!");
}
}
/// Expand an alias of 'mftr' or 'mttr' into the full instruction, by producing
/// an mftr or mttr with the correctly mapped gpr register, u, sel and h bits.
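/// For illustration (values taken from the mapping helpers above): an alias
/// such as "mftc1 $4, $f6" maps $f6 to the same-numbered GPR and is emitted
/// as an mftr with u=1, sel=2, h=0; the mfthc1 form only differs in h=1.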
bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned rd = 0;
unsigned u = 1;
unsigned sel = 0;
unsigned h = 0;
bool IsMFTR = false;
switch (Inst.getOpcode()) {
case Mips::MFTC0:
IsMFTR = true;
LLVM_FALLTHROUGH;
case Mips::MTTC0:
u = 0;
rd = getRegisterForMxtrC0(Inst, IsMFTR);
sel = Inst.getOperand(2).getImm();
break;
case Mips::MFTGPR:
IsMFTR = true;
LLVM_FALLTHROUGH;
case Mips::MTTGPR:
rd = Inst.getOperand(IsMFTR ? 1 : 0).getReg();
break;
case Mips::MFTLO:
case Mips::MFTHI:
case Mips::MFTACX:
case Mips::MFTDSP:
IsMFTR = true;
LLVM_FALLTHROUGH;
case Mips::MTTLO:
case Mips::MTTHI:
case Mips::MTTACX:
case Mips::MTTDSP:
rd = getRegisterForMxtrDSP(Inst, IsMFTR);
sel = 1;
break;
case Mips::MFTHC1:
h = 1;
LLVM_FALLTHROUGH;
case Mips::MFTC1:
IsMFTR = true;
rd = getRegisterForMxtrFP(Inst, IsMFTR);
sel = 2;
break;
case Mips::MTTHC1:
h = 1;
LLVM_FALLTHROUGH;
case Mips::MTTC1:
rd = getRegisterForMxtrFP(Inst, IsMFTR);
sel = 2;
break;
case Mips::CFTC1:
IsMFTR = true;
LLVM_FALLTHROUGH;
case Mips::CTTC1:
rd = getRegisterForMxtrFP(Inst, IsMFTR);
sel = 3;
break;
}
unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd;
unsigned Op1 =
IsMFTR ? rd
: (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg()
: Inst.getOperand(0).getReg());
TOut.emitRRIII(IsMFTR ? Mips::MFTR : Mips::MTTR, Op0, Op1, u, sel, h, IDLoc,
STI);
return false;
}
unsigned
MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst,
const OperandVector &Operands) {
switch (Inst.getOpcode()) {
default:
return Match_Success;
case Mips::DATI:
case Mips::DAHI:
if (static_cast<MipsOperand &>(*Operands[1])
.isValidForTie(static_cast<MipsOperand &>(*Operands[2])))
return Match_Success;
return Match_RequiresSameSrcAndDst;
}
}
unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
switch (Inst.getOpcode()) {
// As described by the MIPSR6 spec, daui must not use the zero register as
// its source operand.
case Mips::DAUI:
if (Inst.getOperand(1).getReg() == Mips::ZERO ||
Inst.getOperand(1).getReg() == Mips::ZERO_64)
return Match_RequiresNoZeroRegister;
return Match_Success;
// As described by the Mips32r2 spec, the registers Rd and Rs for
// jalr.hb must be different.
// The same applies to registers Rt and Rs of the microMIPSR6 jalrc.hb
// instruction, and to registers Rd and Base of the microMIPS lwp instruction.
case Mips::JALR_HB:
case Mips::JALR_HB64:
case Mips::JALRC_HB_MMR6:
case Mips::JALRC_MMR6:
if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
return Match_RequiresDifferentSrcAndDst;
return Match_Success;
case Mips::LWP_MM:
if (Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg())
return Match_RequiresDifferentSrcAndDst;
return Match_Success;
case Mips::SYNC:
if (Inst.getOperand(0).getImm() != 0 && !hasMips32())
return Match_NonZeroOperandForSync;
return Match_Success;
case Mips::MFC0:
case Mips::MTC0:
case Mips::MTC2:
case Mips::MFC2:
if (Inst.getOperand(2).getImm() != 0 && !hasMips32())
return Match_NonZeroOperandForMTCX;
return Match_Success;
// As described by the MIPSR6 spec, the compact branches that compare registers
// must:
// a) Not use the zero register.
// b) Not use the same register twice.
// c) rs < rt for bnec, beqc.
// NB: For this case, the encoding will swap the operands as their
// ordering doesn't matter. GAS performs this transformation too.
// Hence, that constraint does not have to be enforced.
//
// The compact branches that branch iff the signed addition of two registers
// would overflow must have rs >= rt. That can be handled like beqc/bnec with
// operand swapping. They are not restricted from using the zero register.
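// For illustration: "beqc $5, $5, foo" and "beqc $zero, $6, foo" are rejected
// by the checks below, while "beqc $6, $5, foo" is accepted (its operands are
// simply encoded in swapped order).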
case Mips::BLEZC: case Mips::BLEZC_MMR6:
case Mips::BGEZC: case Mips::BGEZC_MMR6:
case Mips::BGTZC: case Mips::BGTZC_MMR6:
case Mips::BLTZC: case Mips::BLTZC_MMR6:
case Mips::BEQZC: case Mips::BEQZC_MMR6:
case Mips::BNEZC: case Mips::BNEZC_MMR6:
case Mips::BLEZC64:
case Mips::BGEZC64:
case Mips::BGTZC64:
case Mips::BLTZC64:
case Mips::BEQZC64:
case Mips::BNEZC64:
if (Inst.getOperand(0).getReg() == Mips::ZERO ||
Inst.getOperand(0).getReg() == Mips::ZERO_64)
return Match_RequiresNoZeroRegister;
return Match_Success;
case Mips::BGEC: case Mips::BGEC_MMR6:
case Mips::BLTC: case Mips::BLTC_MMR6:
case Mips::BGEUC: case Mips::BGEUC_MMR6:
case Mips::BLTUC: case Mips::BLTUC_MMR6:
case Mips::BEQC: case Mips::BEQC_MMR6:
case Mips::BNEC: case Mips::BNEC_MMR6:
case Mips::BGEC64:
case Mips::BLTC64:
case Mips::BGEUC64:
case Mips::BLTUC64:
case Mips::BEQC64:
case Mips::BNEC64:
if (Inst.getOperand(0).getReg() == Mips::ZERO ||
Inst.getOperand(0).getReg() == Mips::ZERO_64)
return Match_RequiresNoZeroRegister;
if (Inst.getOperand(1).getReg() == Mips::ZERO ||
Inst.getOperand(1).getReg() == Mips::ZERO_64)
return Match_RequiresNoZeroRegister;
if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
return Match_RequiresDifferentOperands;
return Match_Success;
case Mips::DINS: {
assert(Inst.getOperand(2).isImm() && Inst.getOperand(3).isImm() &&
"Operands must be immediates for dins!");
const signed Pos = Inst.getOperand(2).getImm();
const signed Size = Inst.getOperand(3).getImm();
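// For illustration: "dins $2, $3, 8, 16" (pos + size = 24) passes this check,
// whereas pos 16 with size 24 (sum 40) is diagnosed with the range error.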
if ((0 > (Pos + Size)) || ((Pos + Size) > 32))
return Match_RequiresPosSizeRange0_32;
return Match_Success;
}
case Mips::DINSM:
case Mips::DINSU: {
assert(Inst.getOperand(2).isImm() && Inst.getOperand(3).isImm() &&
"Operands must be immediates for dinsm/dinsu!");
const signed Pos = Inst.getOperand(2).getImm();
const signed Size = Inst.getOperand(3).getImm();
if ((32 >= (Pos + Size)) || ((Pos + Size) > 64))
return Match_RequiresPosSizeRange33_64;
return Match_Success;
}
case Mips::DEXT: {
assert(Inst.getOperand(2).isImm() && Inst.getOperand(3).isImm() &&
"Operands must be immediates for DEXTM!");
const signed Pos = Inst.getOperand(2).getImm();
const signed Size = Inst.getOperand(3).getImm();
if ((1 > (Pos + Size)) || ((Pos + Size) > 63))
return Match_RequiresPosSizeUImm6;
return Match_Success;
}
case Mips::DEXTM:
case Mips::DEXTU: {
assert(Inst.getOperand(2).isImm() && Inst.getOperand(3).isImm() &&
"Operands must be immediates for dextm/dextu!");
const signed Pos = Inst.getOperand(2).getImm();
const signed Size = Inst.getOperand(3).getImm();
if ((32 > (Pos + Size)) || ((Pos + Size) > 64))
return Match_RequiresPosSizeRange33_64;
return Match_Success;
}
case Mips::CRC32B: case Mips::CRC32CB:
case Mips::CRC32H: case Mips::CRC32CH:
case Mips::CRC32W: case Mips::CRC32CW:
case Mips::CRC32D: case Mips::CRC32CD:
if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg())
return Match_RequiresSameSrcAndDst;
return Match_Success;
}
uint64_t TSFlags = getInstDesc(Inst.getOpcode()).TSFlags;
if ((TSFlags & MipsII::HasFCCRegOperand) &&
(Inst.getOperand(0).getReg() != Mips::FCC0) && !hasEightFccRegisters())
return Match_NoFCCRegisterForCurrentISA;
return Match_Success;
}
static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands,
uint64_t ErrorInfo) {
if (ErrorInfo != ~0ULL && ErrorInfo < Operands.size()) {
SMLoc ErrorLoc = Operands[ErrorInfo]->getStartLoc();
if (ErrorLoc == SMLoc())
return Loc;
return ErrorLoc;
}
return Loc;
}
bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
MCInst Inst;
unsigned MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
switch (MatchResult) {
case Match_Success:
if (processInstruction(Inst, IDLoc, Out, STI))
return true;
return false;
case Match_MissingFeature:
Error(IDLoc, "instruction requires a CPU feature not currently enabled");
return true;
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
ErrorLoc = Operands[ErrorInfo]->getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
}
return Error(ErrorLoc, "invalid operand for instruction");
}
case Match_NonZeroOperandForSync:
return Error(IDLoc,
"s-type must be zero or unspecified for pre-MIPS32 ISAs");
case Match_NonZeroOperandForMTCX:
return Error(IDLoc, "selector must be zero for pre-MIPS32 ISAs");
case Match_MnemonicFail:
return Error(IDLoc, "invalid instruction");
case Match_RequiresDifferentSrcAndDst:
return Error(IDLoc, "source and destination must be different");
case Match_RequiresDifferentOperands:
return Error(IDLoc, "registers must be different");
case Match_RequiresNoZeroRegister:
return Error(IDLoc, "invalid operand ($zero) for instruction");
case Match_RequiresSameSrcAndDst:
return Error(IDLoc, "source and destination must match");
case Match_NoFCCRegisterForCurrentISA:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"non-zero fcc register doesn't exist in current ISA level");
case Match_Immz:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'");
case Match_UImm1_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 1-bit unsigned immediate");
case Match_UImm2_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 2-bit unsigned immediate");
case Match_UImm2_1:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected immediate in range 1 .. 4");
case Match_UImm3_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 3-bit unsigned immediate");
case Match_UImm4_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 4-bit unsigned immediate");
case Match_SImm4_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 4-bit signed immediate");
case Match_UImm5_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 5-bit unsigned immediate");
case Match_SImm5_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 5-bit signed immediate");
case Match_UImm5_1:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected immediate in range 1 .. 32");
case Match_UImm5_32:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected immediate in range 32 .. 63");
case Match_UImm5_33:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected immediate in range 33 .. 64");
case Match_UImm5_0_Report_UImm6:
// This is used on UImm5 operands that have a corresponding UImm5_32
// operand to avoid confusing the user.
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 6-bit unsigned immediate");
case Match_UImm5_Lsl2:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected both 7-bit unsigned immediate and multiple of 4");
case Match_UImmRange2_64:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected immediate in range 2 .. 64");
case Match_UImm6_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 6-bit unsigned immediate");
case Match_UImm6_Lsl2:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected both 8-bit unsigned immediate and multiple of 4");
case Match_SImm6_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 6-bit signed immediate");
case Match_UImm7_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 7-bit unsigned immediate");
case Match_UImm7_N1:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected immediate in range -1 .. 126");
case Match_SImm7_Lsl2:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected both 9-bit signed immediate and multiple of 4");
case Match_UImm8_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 8-bit unsigned immediate");
case Match_UImm10_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 10-bit unsigned immediate");
case Match_SImm10_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 10-bit signed immediate");
case Match_SImm11_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 11-bit signed immediate");
case Match_UImm16:
case Match_UImm16_Relaxed:
case Match_UImm16_AltRelaxed:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 16-bit unsigned immediate");
case Match_SImm16:
case Match_SImm16_Relaxed:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 16-bit signed immediate");
case Match_SImm19_Lsl2:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected both 19-bit signed immediate and multiple of 4");
case Match_UImm20_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 20-bit unsigned immediate");
case Match_UImm26_0:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 26-bit unsigned immediate");
case Match_SImm32:
case Match_SImm32_Relaxed:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 32-bit signed immediate");
case Match_UImm32_Coerced:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected 32-bit immediate");
case Match_MemSImm9:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 9-bit signed offset");
case Match_MemSImm10:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 10-bit signed offset");
case Match_MemSImm10Lsl1:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 11-bit signed offset and multiple of 2");
case Match_MemSImm10Lsl2:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 12-bit signed offset and multiple of 4");
case Match_MemSImm10Lsl3:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 13-bit signed offset and multiple of 8");
case Match_MemSImm11:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 11-bit signed offset");
case Match_MemSImm12:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 12-bit signed offset");
case Match_MemSImm16:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 16-bit signed offset");
case Match_MemSImmPtr:
return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
"expected memory with 32-bit signed offset");
case Match_RequiresPosSizeRange0_32: {
SMLoc ErrorStart = Operands[3]->getStartLoc();
SMLoc ErrorEnd = Operands[4]->getEndLoc();
return Error(ErrorStart, "size plus position are not in the range 0 .. 32",
SMRange(ErrorStart, ErrorEnd));
}
case Match_RequiresPosSizeUImm6: {
SMLoc ErrorStart = Operands[3]->getStartLoc();
SMLoc ErrorEnd = Operands[4]->getEndLoc();
return Error(ErrorStart, "size plus position are not in the range 1 .. 63",
SMRange(ErrorStart, ErrorEnd));
}
case Match_RequiresPosSizeRange33_64: {
SMLoc ErrorStart = Operands[3]->getStartLoc();
SMLoc ErrorEnd = Operands[4]->getEndLoc();
return Error(ErrorStart, "size plus position are not in the range 33 .. 64",
SMRange(ErrorStart, ErrorEnd));
}
}
llvm_unreachable("Implement any new match types added!");
}
void MipsAsmParser::warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc) {
if (RegIndex != 0 && AssemblerOptions.back()->getATRegIndex() == RegIndex)
Warning(Loc, "used $at (currently $" + Twine(RegIndex) +
") without \".set noat\"");
}
void MipsAsmParser::warnIfNoMacro(SMLoc Loc) {
if (!AssemblerOptions.back()->isMacro())
Warning(Loc, "macro instruction expanded into multiple instructions");
}
void MipsAsmParser::ConvertXWPOperands(MCInst &Inst,
const OperandVector &Operands) {
assert(
(Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM) &&
"Unexpected instruction!");
((MipsOperand &)*Operands[1]).addGPR32ZeroAsmRegOperands(Inst, 1);
int NextReg = nextReg(((MipsOperand &)*Operands[1]).getGPR32Reg());
Inst.addOperand(MCOperand::createReg(NextReg));
((MipsOperand &)*Operands[2]).addMemOperands(Inst, 2);
}
void
MipsAsmParser::printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
SMRange Range, bool ShowColors) {
getSourceManager().PrintMessage(Range.Start, SourceMgr::DK_Warning, Msg,
Range, SMFixIt(Range, FixMsg),
ShowColors);
}
int MipsAsmParser::matchCPURegisterName(StringRef Name) {
int CC;
CC = StringSwitch<unsigned>(Name)
.Case("zero", 0)
.Cases("at", "AT", 1)
.Case("a0", 4)
.Case("a1", 5)
.Case("a2", 6)
.Case("a3", 7)
.Case("v0", 2)
.Case("v1", 3)
.Case("s0", 16)
.Case("s1", 17)
.Case("s2", 18)
.Case("s3", 19)
.Case("s4", 20)
.Case("s5", 21)
.Case("s6", 22)
.Case("s7", 23)
.Case("k0", 26)
.Case("k1", 27)
.Case("gp", 28)
.Case("sp", 29)
.Case("fp", 30)
.Case("s8", 30)
.Case("ra", 31)
.Case("t0", 8)
.Case("t1", 9)
.Case("t2", 10)
.Case("t3", 11)
.Case("t4", 12)
.Case("t5", 13)
.Case("t6", 14)
.Case("t7", 15)
.Case("t8", 24)
.Case("t9", 25)
.Default(-1);
if (!(isABI_N32() || isABI_N64()))
return CC;
if (12 <= CC && CC <= 15) {
// Name is one of t4-t7
AsmToken RegTok = getLexer().peekTok();
SMRange RegRange = RegTok.getLocRange();
StringRef FixedName = StringSwitch<StringRef>(Name)
.Case("t4", "t0")
.Case("t5", "t1")
.Case("t6", "t2")
.Case("t7", "t3")
.Default("");
assert(FixedName != "" && "Register name is not one of t4-t7.");
printWarningWithFixIt("register names $t4-$t7 are only available in O32.",
"Did you mean $" + FixedName + "?", RegRange);
}
// Although the SGI documentation just cuts out t0-t3 for n32/n64,
// GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7.
// We support both cases, so for t0-t3 we'll just push them to t4-t7.
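// For illustration: under N32/N64 "t0" (index 8 above) is remapped to index
// 12, i.e. the register that O32 names $t4.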
if (8 <= CC && CC <= 11)
CC += 4;
if (CC == -1)
CC = StringSwitch<unsigned>(Name)
.Case("a4", 8)
.Case("a5", 9)
.Case("a6", 10)
.Case("a7", 11)
.Case("kt0", 26)
.Case("kt1", 27)
.Default(-1);
return CC;
}
int MipsAsmParser::matchHWRegsRegisterName(StringRef Name) {
int CC;
CC = StringSwitch<unsigned>(Name)
.Case("hwr_cpunum", 0)
.Case("hwr_synci_step", 1)
.Case("hwr_cc", 2)
.Case("hwr_ccres", 3)
.Case("hwr_ulr", 29)
.Default(-1);
return CC;
}
int MipsAsmParser::matchFPURegisterName(StringRef Name) {
if (Name[0] == 'f') {
StringRef NumString = Name.substr(1);
unsigned IntVal;
if (NumString.getAsInteger(10, IntVal))
return -1; // This is not an integer.
if (IntVal > 31) // Maximum index for fpu register.
return -1;
return IntVal;
}
return -1;
}
int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
if (Name.startswith("fcc")) {
StringRef NumString = Name.substr(3);
unsigned IntVal;
if (NumString.getAsInteger(10, IntVal))
return -1; // This is not an integer.
if (IntVal > 7) // There are only 8 fcc registers.
return -1;
return IntVal;
}
return -1;
}
int MipsAsmParser::matchACRegisterName(StringRef Name) {
if (Name.startswith("ac")) {
StringRef NumString = Name.substr(2);
unsigned IntVal;
if (NumString.getAsInteger(10, IntVal))
return -1; // This is not an integer.
if (IntVal > 3) // There are only 4 acc registers (ac0-ac3).
return -1;
return IntVal;
}
return -1;
}
int MipsAsmParser::matchMSA128RegisterName(StringRef Name) {
unsigned IntVal;
if (Name.front() != 'w' || Name.drop_front(1).getAsInteger(10, IntVal))
return -1;
if (IntVal > 31)
return -1;
return IntVal;
}
int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
int CC;
CC = StringSwitch<unsigned>(Name)
.Case("msair", 0)
.Case("msacsr", 1)
.Case("msaaccess", 2)
.Case("msasave", 3)
.Case("msamodify", 4)
.Case("msarequest", 5)
.Case("msamap", 6)
.Case("msaunmap", 7)
.Default(-1);
return CC;
}
bool MipsAsmParser::canUseATReg() {
return AssemblerOptions.back()->getATRegIndex() != 0;
}
unsigned MipsAsmParser::getATReg(SMLoc Loc) {
unsigned ATIndex = AssemblerOptions.back()->getATRegIndex();
if (ATIndex == 0) {
reportParseError(Loc,
"pseudo-instruction requires $at, which is not available");
return 0;
}
unsigned AT = getReg(
(isGP64bit()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, ATIndex);
return AT;
}
unsigned MipsAsmParser::getReg(int RC, int RegNo) {
return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
}
bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
MCAsmParser &Parser = getParser();
LLVM_DEBUG(dbgs() << "parseOperand\n");
// Check if the current operand has a custom associated parser; if so, try to
// custom parse the operand, or fall back to the general approach.
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
if (ResTy == MatchOperand_Success)
return false;
// If there wasn't a custom match, try the generic matcher below. Otherwise,
// there was a match, but an error occurred, in which case, just return that
// the operand parsing failed.
if (ResTy == MatchOperand_ParseFail)
return true;
LLVM_DEBUG(dbgs() << ".. Generic Parser\n");
switch (getLexer().getKind()) {
case AsmToken::Dollar: {
// Parse the register.
SMLoc S = Parser.getTok().getLoc();
// Almost all registers have been parsed by custom parsers. There is only
// one exception to this. $zero (and its alias $0) will reach this point
// for div, divu, and similar instructions because it is not an operand
// to the instruction definition but an explicit register. Special case
// this situation for now.
if (parseAnyRegister(Operands) != MatchOperand_NoMatch)
return false;
// Maybe it is a symbol reference.
StringRef Identifier;
if (Parser.parseIdentifier(Identifier))
return true;
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
MCSymbol *Sym = getContext().getOrCreateSymbol("$" + Identifier);
// Otherwise create a symbol reference.
const MCExpr *Res =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
Operands.push_back(MipsOperand::CreateImm(Res, S, E, *this));
return false;
}
default: {
LLVM_DEBUG(dbgs() << ".. generic integer expression\n");
const MCExpr *Expr;
SMLoc S = Parser.getTok().getLoc(); // Start location of the operand.
if (getParser().parseExpression(Expr))
return true;
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(MipsOperand::CreateImm(Expr, S, E, *this));
return false;
}
} // switch(getLexer().getKind())
return true;
}
bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
switch (Expr->getKind()) {
case MCExpr::Constant:
return true;
case MCExpr::SymbolRef:
return (cast<MCSymbolRefExpr>(Expr)->getKind() != MCSymbolRefExpr::VK_None);
case MCExpr::Binary: {
const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
if (!isEvaluated(BE->getLHS()))
return false;
return isEvaluated(BE->getRHS());
}
case MCExpr::Unary:
return isEvaluated(cast<MCUnaryExpr>(Expr)->getSubExpr());
case MCExpr::Target:
return true;
}
return false;
}
bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
OperandMatchResultTy ResTy = parseAnyRegister(Operands);
if (ResTy == MatchOperand_Success) {
assert(Operands.size() == 1);
MipsOperand &Operand = static_cast<MipsOperand &>(*Operands.front());
StartLoc = Operand.getStartLoc();
EndLoc = Operand.getEndLoc();
// AFAIK, we only support numeric registers and named GPRs in CFI
// directives.
// Don't worry about eating tokens before failing. Using an unrecognised
// register is a parse error.
if (Operand.isGPRAsmReg()) {
// Resolve to GPR32 or GPR64 appropriately.
RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
}
return (RegNo == (unsigned)-1);
}
assert(Operands.size() == 0);
return (RegNo == (unsigned)-1);
}
bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
SMLoc S;
if (isParenExpr)
return getParser().parseParenExprOfDepth(0, Res, S);
return getParser().parseExpression(Res);
}
OperandMatchResultTy
MipsAsmParser::parseMemOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
LLVM_DEBUG(dbgs() << "parseMemOperand\n");
const MCExpr *IdVal = nullptr;
SMLoc S;
bool isParenExpr = false;
OperandMatchResultTy Res = MatchOperand_NoMatch;
// First operand is the offset.
S = Parser.getTok().getLoc();
if (getLexer().getKind() == AsmToken::LParen) {
Parser.Lex();
isParenExpr = true;
}
if (getLexer().getKind() != AsmToken::Dollar) {
if (parseMemOffset(IdVal, isParenExpr))
return MatchOperand_ParseFail;
const AsmToken &Tok = Parser.getTok(); // Get the next token.
if (Tok.isNot(AsmToken::LParen)) {
MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]);
if (Mnemonic.getToken() == "la" || Mnemonic.getToken() == "dla") {
SMLoc E =
SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
return MatchOperand_Success;
}
if (Tok.is(AsmToken::EndOfStatement)) {
SMLoc E =
SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
// Zero register assumed, add a memory operand with ZERO as its base.
// "Base" will be managed by k_Memory.
auto Base = MipsOperand::createGPRReg(
0, "0", getContext().getRegisterInfo(), S, E, *this);
Operands.push_back(
MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this));
return MatchOperand_Success;
}
MCBinaryExpr::Opcode Opcode;
// GAS and LLVM treat comparison operators differently. GAS will generate -1
// or 0, while LLVM will generate 0 or 1. Since a comparison operator is
// highly unlikely to be found in a memory offset expression, we don't
// handle them.
switch (Tok.getKind()) {
case AsmToken::Plus:
Opcode = MCBinaryExpr::Add;
Parser.Lex();
break;
case AsmToken::Minus:
Opcode = MCBinaryExpr::Sub;
Parser.Lex();
break;
case AsmToken::Star:
Opcode = MCBinaryExpr::Mul;
Parser.Lex();
break;
case AsmToken::Pipe:
Opcode = MCBinaryExpr::Or;
Parser.Lex();
break;
case AsmToken::Amp:
Opcode = MCBinaryExpr::And;
Parser.Lex();
break;
case AsmToken::LessLess:
Opcode = MCBinaryExpr::Shl;
Parser.Lex();
break;
case AsmToken::GreaterGreater:
Opcode = MCBinaryExpr::LShr;
Parser.Lex();
break;
case AsmToken::Caret:
Opcode = MCBinaryExpr::Xor;
Parser.Lex();
break;
case AsmToken::Slash:
Opcode = MCBinaryExpr::Div;
Parser.Lex();
break;
case AsmToken::Percent:
Opcode = MCBinaryExpr::Mod;
Parser.Lex();
break;
default:
Error(Parser.getTok().getLoc(), "'(' or expression expected");
return MatchOperand_ParseFail;
}
const MCExpr * NextExpr;
if (getParser().parseExpression(NextExpr))
return MatchOperand_ParseFail;
IdVal = MCBinaryExpr::create(Opcode, IdVal, NextExpr, getContext());
}
Parser.Lex(); // Eat the '(' token.
}
Res = parseAnyRegister(Operands);
if (Res != MatchOperand_Success)
return Res;
if (Parser.getTok().isNot(AsmToken::RParen)) {
Error(Parser.getTok().getLoc(), "')' expected");
return MatchOperand_ParseFail;
}
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Parser.Lex(); // Eat the ')' token.
if (!IdVal)
IdVal = MCConstantExpr::create(0, getContext());
// Replace the register operand with the memory operand.
std::unique_ptr<MipsOperand> op(
static_cast<MipsOperand *>(Operands.back().release()));
// Remove the register from the operands.
// "op" will be managed by k_Memory.
Operands.pop_back();
// Add the memory operand.
if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(IdVal)) {
int64_t Imm;
if (IdVal->evaluateAsAbsolute(Imm))
IdVal = MCConstantExpr::create(Imm, getContext());
else if (BE->getLHS()->getKind() != MCExpr::SymbolRef)
IdVal = MCBinaryExpr::create(BE->getOpcode(), BE->getRHS(), BE->getLHS(),
getContext());
}
Operands.push_back(MipsOperand::CreateMem(std::move(op), IdVal, S, E, *this));
return MatchOperand_Success;
}
bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
MCSymbol *Sym = getContext().lookupSymbol(Parser.getTok().getIdentifier());
if (!Sym)
return false;
SMLoc S = Parser.getTok().getLoc();
if (Sym->isVariable()) {
const MCExpr *Expr = Sym->getVariableValue();
if (Expr->getKind() == MCExpr::SymbolRef) {
const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
StringRef DefSymbol = Ref->getSymbol().getName();
if (DefSymbol.startswith("$")) {
OperandMatchResultTy ResTy =
matchAnyRegisterNameWithoutDollar(Operands, DefSymbol.substr(1), S);
if (ResTy == MatchOperand_Success) {
Parser.Lex();
return true;
}
if (ResTy == MatchOperand_ParseFail)
llvm_unreachable("Should never ParseFail");
}
}
} else if (Sym->isUnset()) {
// If symbol is unset, it might be created in the `parseSetAssignment`
// routine as an alias for a numeric register name.
// Lookup in the aliases list.
auto Entry = RegisterSets.find(Sym->getName());
if (Entry != RegisterSets.end()) {
OperandMatchResultTy ResTy =
matchAnyRegisterWithoutDollar(Operands, Entry->getValue(), S);
if (ResTy == MatchOperand_Success) {
Parser.Lex();
return true;
}
}
}
return false;
}
OperandMatchResultTy
MipsAsmParser::matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
StringRef Identifier,
SMLoc S) {
int Index = matchCPURegisterName(Identifier);
if (Index != -1) {
Operands.push_back(MipsOperand::createGPRReg(
Index, Identifier, getContext().getRegisterInfo(), S,
getLexer().getLoc(), *this));
return MatchOperand_Success;
}
Index = matchHWRegsRegisterName(Identifier);
if (Index != -1) {
Operands.push_back(MipsOperand::createHWRegsReg(
Index, Identifier, getContext().getRegisterInfo(), S,
getLexer().getLoc(), *this));
return MatchOperand_Success;
}
Index = matchFPURegisterName(Identifier);
if (Index != -1) {
Operands.push_back(MipsOperand::createFGRReg(
Index, Identifier, getContext().getRegisterInfo(), S,
getLexer().getLoc(), *this));
return MatchOperand_Success;
}
Index = matchFCCRegisterName(Identifier);
if (Index != -1) {
Operands.push_back(MipsOperand::createFCCReg(
Index, Identifier, getContext().getRegisterInfo(), S,
getLexer().getLoc(), *this));
return MatchOperand_Success;
}
Index = matchACRegisterName(Identifier);
if (Index != -1) {
Operands.push_back(MipsOperand::createACCReg(
Index, Identifier, getContext().getRegisterInfo(), S,
getLexer().getLoc(), *this));
return MatchOperand_Success;
}
Index = matchMSA128RegisterName(Identifier);
if (Index != -1) {
Operands.push_back(MipsOperand::createMSA128Reg(
Index, Identifier, getContext().getRegisterInfo(), S,
getLexer().getLoc(), *this));
return MatchOperand_Success;
}
Index = matchMSA128CtrlRegisterName(Identifier);
if (Index != -1) {
Operands.push_back(MipsOperand::createMSACtrlReg(
Index, Identifier, getContext().getRegisterInfo(), S,
getLexer().getLoc(), *this));
return MatchOperand_Success;
}
return MatchOperand_NoMatch;
}
OperandMatchResultTy
MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands,
const AsmToken &Token, SMLoc S) {
if (Token.is(AsmToken::Identifier)) {
LLVM_DEBUG(dbgs() << ".. identifier\n");
StringRef Identifier = Token.getIdentifier();
OperandMatchResultTy ResTy =
matchAnyRegisterNameWithoutDollar(Operands, Identifier, S);
return ResTy;
} else if (Token.is(AsmToken::Integer)) {
LLVM_DEBUG(dbgs() << ".. integer\n");
int64_t RegNum = Token.getIntVal();
if (RegNum < 0 || RegNum > 31) {
// Show the error, but treat invalid register
// number as a normal one to continue parsing
// and catch other possible errors.
Error(getLexer().getLoc(), "invalid register number");
}
Operands.push_back(MipsOperand::createNumericReg(
RegNum, Token.getString(), getContext().getRegisterInfo(), S,
Token.getLoc(), *this));
return MatchOperand_Success;
}
LLVM_DEBUG(dbgs() << Token.getKind() << "\n");
return MatchOperand_NoMatch;
}
OperandMatchResultTy
MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
auto Token = getLexer().peekTok(false);
return matchAnyRegisterWithoutDollar(Operands, Token, S);
}
OperandMatchResultTy
MipsAsmParser::parseAnyRegister(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
LLVM_DEBUG(dbgs() << "parseAnyRegister\n");
auto Token = Parser.getTok();
SMLoc S = Token.getLoc();
if (Token.isNot(AsmToken::Dollar)) {
LLVM_DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
if (Token.is(AsmToken::Identifier)) {
if (searchSymbolAlias(Operands))
return MatchOperand_Success;
}
LLVM_DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
return MatchOperand_NoMatch;
}
LLVM_DEBUG(dbgs() << ".. $\n");
OperandMatchResultTy ResTy = matchAnyRegisterWithoutDollar(Operands, S);
if (ResTy == MatchOperand_Success) {
Parser.Lex(); // $
Parser.Lex(); // identifier
}
return ResTy;
}
OperandMatchResultTy
MipsAsmParser::parseJumpTarget(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
LLVM_DEBUG(dbgs() << "parseJumpTarget\n");
SMLoc S = getLexer().getLoc();
// Registers are a valid target and have priority over symbols.
OperandMatchResultTy ResTy = parseAnyRegister(Operands);
if (ResTy != MatchOperand_NoMatch)
return ResTy;
// Integers and expressions are acceptable
const MCExpr *Expr = nullptr;
if (Parser.parseExpression(Expr)) {
// We have no way of knowing if a symbol was consumed so we must ParseFail
return MatchOperand_ParseFail;
}
Operands.push_back(
MipsOperand::CreateImm(Expr, S, getLexer().getLoc(), *this));
return MatchOperand_Success;
}
OperandMatchResultTy
MipsAsmParser::parseInvNum(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
const MCExpr *IdVal;
// If the first token is '$' we may have a register operand. We have to reject
// cases where it is not a register. Complicating the matter is that
// register names are not reserved across all ABIs.
// Peek past the dollar to see if it's a register name for this ABI.
SMLoc S = Parser.getTok().getLoc();
if (Parser.getTok().is(AsmToken::Dollar)) {
return matchCPURegisterName(Parser.getLexer().peekTok().getString()) == -1
? MatchOperand_ParseFail
: MatchOperand_NoMatch;
}
if (getParser().parseExpression(IdVal))
return MatchOperand_ParseFail;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal);
if (!MCE)
return MatchOperand_NoMatch;
int64_t Val = MCE->getValue();
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(MipsOperand::CreateImm(
MCConstantExpr::create(0 - Val, getContext()), S, E, *this));
return MatchOperand_Success;
}
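// Parse a register-list operand, e.g. "$16-$19, $31": runs of saved registers
// (optionally written as ranges) plus $fp/$ra, with the ordering and
// consecutiveness rules enforced below.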
OperandMatchResultTy
MipsAsmParser::parseRegisterList(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SmallVector<unsigned, 10> Regs;
unsigned RegNo;
unsigned PrevReg = Mips::NoRegister;
bool RegRange = false;
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands;
if (Parser.getTok().isNot(AsmToken::Dollar))
return MatchOperand_ParseFail;
SMLoc S = Parser.getTok().getLoc();
while (parseAnyRegister(TmpOperands) == MatchOperand_Success) {
SMLoc E = getLexer().getLoc();
MipsOperand &Reg = static_cast<MipsOperand &>(*TmpOperands.back());
RegNo = isGP64bit() ? Reg.getGPR64Reg() : Reg.getGPR32Reg();
if (RegRange) {
// Remove last register operand because registers from register range
// should be inserted first.
if ((isGP64bit() && RegNo == Mips::RA_64) ||
(!isGP64bit() && RegNo == Mips::RA)) {
Regs.push_back(RegNo);
} else {
unsigned TmpReg = PrevReg + 1;
while (TmpReg <= RegNo) {
if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) ||
(((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) &&
isGP64bit())) {
Error(E, "invalid register operand");
return MatchOperand_ParseFail;
}
PrevReg = TmpReg;
Regs.push_back(TmpReg++);
}
}
RegRange = false;
} else {
if ((PrevReg == Mips::NoRegister) &&
((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) ||
(!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) {
Error(E, "$16 or $31 expected");
return MatchOperand_ParseFail;
} else if (!(((RegNo == Mips::FP || RegNo == Mips::RA ||
(RegNo >= Mips::S0 && RegNo <= Mips::S7)) &&
!isGP64bit()) ||
((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 ||
(RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) &&
isGP64bit()))) {
Error(E, "invalid register operand");
return MatchOperand_ParseFail;
} else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) &&
((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) ||
(RegNo != Mips::FP_64 && RegNo != Mips::RA_64 &&
isGP64bit()))) {
Error(E, "consecutive register numbers expected");
return MatchOperand_ParseFail;
}
Regs.push_back(RegNo);
}
if (Parser.getTok().is(AsmToken::Minus))
RegRange = true;
if (Parser.getTok().isNot(AsmToken::Minus) &&
Parser.getTok().isNot(AsmToken::Comma)) {
Error(E, "',' or '-' expected");
return MatchOperand_ParseFail;
}
Lex(); // Consume comma or minus
if (Parser.getTok().isNot(AsmToken::Dollar))
break;
PrevReg = RegNo;
}
SMLoc E = Parser.getTok().getLoc();
Operands.push_back(MipsOperand::CreateRegList(Regs, S, E, *this));
parseMemOperand(Operands);
return MatchOperand_Success;
}
/// Sometimes (e.g. load/stores) the operand may be followed immediately by
/// a parenthesized register:
/// ::= '(', register, ')'
/// Handle it before we iterate so we don't get tripped up by the lack of
/// a comma.
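/// For illustration: when an operand is followed directly by "($4)", the "(",
/// the register and the ")" are appended here as three separate operands.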
bool MipsAsmParser::parseParenSuffix(StringRef Name, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
if (getLexer().is(AsmToken::LParen)) {
Operands.push_back(
MipsOperand::CreateToken("(", getLexer().getLoc(), *this));
Parser.Lex();
if (parseOperand(Operands, Name)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, "unexpected token in argument list");
}
if (Parser.getTok().isNot(AsmToken::RParen)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, "unexpected token, expected ')'");
}
Operands.push_back(
MipsOperand::CreateToken(")", getLexer().getLoc(), *this));
Parser.Lex();
}
return false;
}
/// Sometimes (e.g. in MSA) the operand may be followed immediately by
/// one of these:
/// ::= '[', register, ']'
/// ::= '[', integer, ']'
/// Handle it before we iterate so we don't get tripped up by the lack of
/// a comma.
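/// For illustration: the element index in an MSA operand such as "$w1[2]" is
/// parsed here, with "[", the index and "]" appended as separate operands.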
bool MipsAsmParser::parseBracketSuffix(StringRef Name,
OperandVector &Operands) {
MCAsmParser &Parser = getParser();
if (getLexer().is(AsmToken::LBrac)) {
Operands.push_back(
MipsOperand::CreateToken("[", getLexer().getLoc(), *this));
Parser.Lex();
if (parseOperand(Operands, Name)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, "unexpected token in argument list");
}
if (Parser.getTok().isNot(AsmToken::RBrac)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, "unexpected token, expected ']'");
}
Operands.push_back(
MipsOperand::CreateToken("]", getLexer().getLoc(), *this));
Parser.Lex();
}
return false;
}
static std::string MipsMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS,
unsigned VariantID = 0);
bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
LLVM_DEBUG(dbgs() << "ParseInstruction\n");
// We have reached the first instruction; module directives are now forbidden.
getTargetStreamer().forbidModuleDirective();
// Check if we have a valid mnemonic.
if (!mnemonicIsValid(Name, 0)) {
FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
std::string Suggestion = MipsMnemonicSpellCheck(Name, FBS);
return Error(NameLoc, "unknown instruction" + Suggestion);
}
// First operand in MCInst is instruction mnemonic.
Operands.push_back(MipsOperand::CreateToken(Name, NameLoc, *this));
// Read the remaining operands.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
// Read the first operand.
if (parseOperand(Operands, Name)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, "unexpected token in argument list");
}
if (getLexer().is(AsmToken::LBrac) && parseBracketSuffix(Name, Operands))
return true;
// AFAIK, parenthesis suffixes are never on the first operand
while (getLexer().is(AsmToken::Comma)) {
Parser.Lex(); // Eat the comma.
// Parse and remember the operand.
if (parseOperand(Operands, Name)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, "unexpected token in argument list");
}
// Parse bracket and parenthesis suffixes before we iterate
if (getLexer().is(AsmToken::LBrac)) {
if (parseBracketSuffix(Name, Operands))
return true;
} else if (getLexer().is(AsmToken::LParen) &&
parseParenSuffix(Name, Operands))
return true;
}
}
if (getLexer().isNot(AsmToken::EndOfStatement)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, "unexpected token in argument list");
}
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
// FIXME: Given that these have the same name, these should both be
// consistent on affecting the Parser.
bool MipsAsmParser::reportParseError(Twine ErrorMsg) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, ErrorMsg);
}
bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) {
return Error(Loc, ErrorMsg);
}
bool MipsAsmParser::parseSetNoAtDirective() {
MCAsmParser &Parser = getParser();
// Line should look like: ".set noat".
// Set the $at register to $0.
AssemblerOptions.back()->setATRegIndex(0);
Parser.Lex(); // Eat "noat".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
getTargetStreamer().emitDirectiveSetNoAt();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetAtDirective() {
// Line can be: ".set at", which sets $at to $1
// or ".set at=$reg", which sets $at to $reg.
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "at".
if (getLexer().is(AsmToken::EndOfStatement)) {
// No register was specified, so we set $at to $1.
AssemblerOptions.back()->setATRegIndex(1);
getTargetStreamer().emitDirectiveSetAt();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
if (getLexer().isNot(AsmToken::Equal)) {
reportParseError("unexpected token, expected equals sign");
return false;
}
Parser.Lex(); // Eat "=".
if (getLexer().isNot(AsmToken::Dollar)) {
if (getLexer().is(AsmToken::EndOfStatement)) {
reportParseError("no register specified");
return false;
} else {
reportParseError("unexpected token, expected dollar sign '$'");
return false;
}
}
Parser.Lex(); // Eat "$".
// Find out what "reg" is.
unsigned AtRegNo;
const AsmToken &Reg = Parser.getTok();
if (Reg.is(AsmToken::Identifier)) {
AtRegNo = matchCPURegisterName(Reg.getIdentifier());
} else if (Reg.is(AsmToken::Integer)) {
AtRegNo = Reg.getIntVal();
} else {
reportParseError("unexpected token, expected identifier or integer");
return false;
}
// Check if $reg is a valid register. If it is, set $at to $reg.
if (!AssemblerOptions.back()->setATRegIndex(AtRegNo)) {
reportParseError("invalid register");
return false;
}
Parser.Lex(); // Eat "reg".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
getTargetStreamer().emitDirectiveSetAtWithArg(AtRegNo);
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetReorderDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
AssemblerOptions.back()->setReorder();
getTargetStreamer().emitDirectiveSetReorder();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetNoReorderDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
AssemblerOptions.back()->setNoReorder();
getTargetStreamer().emitDirectiveSetNoReorder();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetMacroDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
AssemblerOptions.back()->setMacro();
getTargetStreamer().emitDirectiveSetMacro();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetNoMacroDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
if (AssemblerOptions.back()->isReorder()) {
reportParseError("`noreorder' must be set before `nomacro'");
return false;
}
AssemblerOptions.back()->setNoMacro();
getTargetStreamer().emitDirectiveSetNoMacro();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetMsaDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement))
return reportParseError("unexpected token, expected end of statement");
setFeatureBits(Mips::FeatureMSA, "msa");
getTargetStreamer().emitDirectiveSetMsa();
return false;
}
bool MipsAsmParser::parseSetNoMsaDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement))
return reportParseError("unexpected token, expected end of statement");
clearFeatureBits(Mips::FeatureMSA, "msa");
getTargetStreamer().emitDirectiveSetNoMsa();
return false;
}
bool MipsAsmParser::parseSetNoDspDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "nodsp".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
clearFeatureBits(Mips::FeatureDSP, "dsp");
getTargetStreamer().emitDirectiveSetNoDsp();
return false;
}
bool MipsAsmParser::parseSetMips16Directive() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "mips16".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
setFeatureBits(Mips::FeatureMips16, "mips16");
getTargetStreamer().emitDirectiveSetMips16();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetNoMips16Directive() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "nomips16".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
clearFeatureBits(Mips::FeatureMips16, "mips16");
getTargetStreamer().emitDirectiveSetNoMips16();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetFpDirective() {
MCAsmParser &Parser = getParser();
MipsABIFlagsSection::FpABIKind FpAbiVal;
// Line can be: .set fp=32
// .set fp=xx
// .set fp=64
Parser.Lex(); // Eat fp token
AsmToken Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Equal)) {
reportParseError("unexpected token, expected equals sign '='");
return false;
}
Parser.Lex(); // Eat '=' token.
Tok = Parser.getTok();
if (!parseFpABIValue(FpAbiVal, ".set"))
return false;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
getTargetStreamer().emitDirectiveSetFp(FpAbiVal);
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetOddSPRegDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "oddspreg".
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
getTargetStreamer().emitDirectiveSetOddSPReg();
return false;
}
bool MipsAsmParser::parseSetNoOddSPRegDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "nooddspreg".
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
getTargetStreamer().emitDirectiveSetNoOddSPReg();
return false;
}
bool MipsAsmParser::parseSetMtDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "mt".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
setFeatureBits(Mips::FeatureMT, "mt");
getTargetStreamer().emitDirectiveSetMt();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetNoMtDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "nomt".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
clearFeatureBits(Mips::FeatureMT, "mt");
getTargetStreamer().emitDirectiveSetNoMt();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetNoCRCDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "nocrc".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
clearFeatureBits(Mips::FeatureCRC, "crc");
getTargetStreamer().emitDirectiveSetNoCRC();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetNoVirtDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "novirt".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
clearFeatureBits(Mips::FeatureVirt, "virt");
getTargetStreamer().emitDirectiveSetNoVirt();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetNoGINVDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "noginv".
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
clearFeatureBits(Mips::FeatureGINV, "ginv");
getTargetStreamer().emitDirectiveSetNoGINV();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseSetPopDirective() {
MCAsmParser &Parser = getParser();
SMLoc Loc = getLexer().getLoc();
Parser.Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return reportParseError("unexpected token, expected end of statement");
// Always keep an element on the options "stack" to prevent the user
// from changing the initial options. This is how we remember them.
if (AssemblerOptions.size() == 2)
return reportParseError(Loc, ".set pop with no .set push");
MCSubtargetInfo &STI = copySTI();
AssemblerOptions.pop_back();
setAvailableFeatures(
ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures()));
STI.setFeatureBits(AssemblerOptions.back()->getFeatures());
getTargetStreamer().emitDirectiveSetPop();
return false;
}
bool MipsAsmParser::parseSetPushDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return reportParseError("unexpected token, expected end of statement");
// Create a copy of the current assembler options environment and push it.
AssemblerOptions.push_back(
llvm::make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
getTargetStreamer().emitDirectiveSetPush();
return false;
}
bool MipsAsmParser::parseSetSoftFloatDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return reportParseError("unexpected token, expected end of statement");
setFeatureBits(Mips::FeatureSoftFloat, "soft-float");
getTargetStreamer().emitDirectiveSetSoftFloat();
return false;
}
bool MipsAsmParser::parseSetHardFloatDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return reportParseError("unexpected token, expected end of statement");
clearFeatureBits(Mips::FeatureSoftFloat, "soft-float");
getTargetStreamer().emitDirectiveSetHardFloat();
return false;
}
bool MipsAsmParser::parseSetAssignment() {
StringRef Name;
MCAsmParser &Parser = getParser();
if (Parser.parseIdentifier(Name))
return reportParseError("expected identifier after .set");
if (getLexer().isNot(AsmToken::Comma))
return reportParseError("unexpected token, expected comma");
Lex(); // Eat comma
if (getLexer().is(AsmToken::Dollar) &&
getLexer().peekTok().is(AsmToken::Integer)) {
// Parse assignment of a numeric register:
// .set r1,$1
Parser.Lex(); // Eat $.
RegisterSets[Name] = Parser.getTok();
Parser.Lex(); // Eat identifier.
getContext().getOrCreateSymbol(Name);
return false;
}
MCSymbol *Sym;
const MCExpr *Value;
if (MCParserUtils::parseAssignmentExpression(Name, /* allow_redef */ true,
Parser, Sym, Value))
return true;
Sym->setVariableValue(Value);
return false;
}
bool MipsAsmParser::parseSetMips0Directive() {
MCAsmParser &Parser = getParser();
Parser.Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return reportParseError("unexpected token, expected end of statement");
// Reset assembler options to their initial values.
MCSubtargetInfo &STI = copySTI();
setAvailableFeatures(
ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures()));
STI.setFeatureBits(AssemblerOptions.front()->getFeatures());
AssemblerOptions.back()->setFeatures(AssemblerOptions.front()->getFeatures());
getTargetStreamer().emitDirectiveSetMips0();
return false;
}
bool MipsAsmParser::parseSetArchDirective() {
MCAsmParser &Parser = getParser();
Parser.Lex();
if (getLexer().isNot(AsmToken::Equal))
return reportParseError("unexpected token, expected equals sign");
Parser.Lex();
StringRef Arch;
if (Parser.parseIdentifier(Arch))
return reportParseError("expected arch identifier");
StringRef ArchFeatureName =
StringSwitch<StringRef>(Arch)
.Case("mips1", "mips1")
.Case("mips2", "mips2")
.Case("mips3", "mips3")
.Case("mips4", "mips4")
.Case("mips5", "mips5")
.Case("mips32", "mips32")
.Case("mips32r2", "mips32r2")
.Case("mips32r3", "mips32r3")
.Case("mips32r5", "mips32r5")
.Case("mips32r6", "mips32r6")
.Case("mips64", "mips64")
.Case("mips64r2", "mips64r2")
.Case("mips64r3", "mips64r3")
.Case("mips64r5", "mips64r5")
.Case("mips64r6", "mips64r6")
.Case("octeon", "cnmips")
.Case("r4000", "mips3") // This is an implementation of Mips3.
.Default("");
if (ArchFeatureName.empty())
return reportParseError("unsupported architecture");
if (ArchFeatureName == "mips64r6" && inMicroMipsMode())
return reportParseError("mips64r6 does not support microMIPS");
selectArch(ArchFeatureName);
getTargetStreamer().emitDirectiveSetArch(Arch);
return false;
}
bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
MCAsmParser &Parser = getParser();
Parser.Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return reportParseError("unexpected token, expected end of statement");
switch (Feature) {
default:
llvm_unreachable("Unimplemented feature");
case Mips::FeatureDSP:
setFeatureBits(Mips::FeatureDSP, "dsp");
getTargetStreamer().emitDirectiveSetDsp();
break;
case Mips::FeatureDSPR2:
setFeatureBits(Mips::FeatureDSPR2, "dspr2");
getTargetStreamer().emitDirectiveSetDspr2();
break;
case Mips::FeatureMicroMips:
setFeatureBits(Mips::FeatureMicroMips, "micromips");
getTargetStreamer().emitDirectiveSetMicroMips();
break;
case Mips::FeatureMips1:
selectArch("mips1");
getTargetStreamer().emitDirectiveSetMips1();
break;
case Mips::FeatureMips2:
selectArch("mips2");
getTargetStreamer().emitDirectiveSetMips2();
break;
case Mips::FeatureMips3:
selectArch("mips3");
getTargetStreamer().emitDirectiveSetMips3();
break;
case Mips::FeatureMips4:
selectArch("mips4");
getTargetStreamer().emitDirectiveSetMips4();
break;
case Mips::FeatureMips5:
selectArch("mips5");
getTargetStreamer().emitDirectiveSetMips5();
break;
case Mips::FeatureMips32:
selectArch("mips32");
getTargetStreamer().emitDirectiveSetMips32();
break;
case Mips::FeatureMips32r2:
selectArch("mips32r2");
getTargetStreamer().emitDirectiveSetMips32R2();
break;
case Mips::FeatureMips32r3:
selectArch("mips32r3");
getTargetStreamer().emitDirectiveSetMips32R3();
break;
case Mips::FeatureMips32r5:
selectArch("mips32r5");
getTargetStreamer().emitDirectiveSetMips32R5();
break;
case Mips::FeatureMips32r6:
selectArch("mips32r6");
getTargetStreamer().emitDirectiveSetMips32R6();
break;
case Mips::FeatureMips64:
selectArch("mips64");
getTargetStreamer().emitDirectiveSetMips64();
break;
case Mips::FeatureMips64r2:
selectArch("mips64r2");
getTargetStreamer().emitDirectiveSetMips64R2();
break;
case Mips::FeatureMips64r3:
selectArch("mips64r3");
getTargetStreamer().emitDirectiveSetMips64R3();
break;
case Mips::FeatureMips64r5:
selectArch("mips64r5");
getTargetStreamer().emitDirectiveSetMips64R5();
break;
case Mips::FeatureMips64r6:
selectArch("mips64r6");
getTargetStreamer().emitDirectiveSetMips64R6();
break;
case Mips::FeatureCRC:
setFeatureBits(Mips::FeatureCRC, "crc");
getTargetStreamer().emitDirectiveSetCRC();
break;
case Mips::FeatureVirt:
setFeatureBits(Mips::FeatureVirt, "virt");
getTargetStreamer().emitDirectiveSetVirt();
break;
case Mips::FeatureGINV:
setFeatureBits(Mips::FeatureGINV, "ginv");
getTargetStreamer().emitDirectiveSetGINV();
break;
}
return false;
}
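// For illustration, typical directives that reach parseSetFeature include:
//   .set dsp
//   .set mips64r6
//   .set crc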
bool MipsAsmParser::eatComma(StringRef ErrorStr) {
MCAsmParser &Parser = getParser();
if (getLexer().isNot(AsmToken::Comma)) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, ErrorStr);
}
Parser.Lex(); // Eat the comma.
return true;
}
// Used to determine if .cpload, .cprestore, and .cpsetup have any effect.
// In this class, it is only used for .cprestore.
// FIXME: Only keep track of IsPicEnabled in one place, instead of in both
// MipsTargetELFStreamer and MipsAsmParser.
bool MipsAsmParser::isPicAndNotNxxAbi() {
return inPicMode() && !(isABI_N32() || isABI_N64());
}
bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
if (AssemblerOptions.back()->isReorder())
Warning(Loc, ".cpload should be inside a noreorder section");
if (inMips16Mode()) {
reportParseError(".cpload is not supported in Mips16 mode");
return false;
}
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
OperandMatchResultTy ResTy = parseAnyRegister(Reg);
if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
reportParseError("expected register containing function address");
return false;
}
MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]);
if (!RegOpnd.isGPRAsmReg()) {
reportParseError(RegOpnd.getStartLoc(), "invalid register");
return false;
}
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
getTargetStreamer().emitDirectiveCpLoad(RegOpnd.getGPR32Reg());
return false;
}
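// For illustration, a conventional O32 PIC prologue using this directive
// (the register choice follows the usual $t9 calling convention):
//   .set noreorder
//   .cpload $25
//   .set reorder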
bool MipsAsmParser::parseDirectiveCpLocal(SMLoc Loc) {
if (!isABI_N32() && !isABI_N64()) {
reportParseError(".cplocal is allowed only in N32 or N64 mode");
return false;
}
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
OperandMatchResultTy ResTy = parseAnyRegister(Reg);
if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
reportParseError("expected register containing global pointer");
return false;
}
MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]);
if (!RegOpnd.isGPRAsmReg()) {
reportParseError(RegOpnd.getStartLoc(), "invalid register");
return false;
}
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
getParser().Lex(); // Consume the EndOfStatement.
unsigned NewReg = RegOpnd.getGPR32Reg();
if (IsPicEnabled)
GPReg = NewReg;
getTargetStreamer().emitDirectiveCpLocal(NewReg);
return false;
}
bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) {
MCAsmParser &Parser = getParser();
// Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it
// is used in non-PIC mode.
if (inMips16Mode()) {
reportParseError(".cprestore is not supported in Mips16 mode");
return false;
}
// Get the stack offset value.
const MCExpr *StackOffset;
int64_t StackOffsetVal;
if (Parser.parseExpression(StackOffset)) {
reportParseError("expected stack offset value");
return false;
}
if (!StackOffset->evaluateAsAbsolute(StackOffsetVal)) {
reportParseError("stack offset is not an absolute expression");
return false;
}
if (StackOffsetVal < 0) {
Warning(Loc, ".cprestore with negative stack offset has no effect");
IsCpRestoreSet = false;
} else {
IsCpRestoreSet = true;
CpRestoreOffset = StackOffsetVal;
}
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
if (!getTargetStreamer().emitDirectiveCpRestore(
CpRestoreOffset, [&]() { return getATReg(Loc); }, Loc, STI))
return true;
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
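// For illustration (the offset is hypothetical):
//   .cprestore 16    # keep a copy of $gp at 16($sp), reloaded after calls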
bool MipsAsmParser::parseDirectiveCPSetup() {
MCAsmParser &Parser = getParser();
unsigned FuncReg;
unsigned Save;
bool SaveIsReg = true;
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
if (ResTy == MatchOperand_NoMatch) {
reportParseError("expected register containing function address");
return false;
}
MipsOperand &FuncRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
if (!FuncRegOpnd.isGPRAsmReg()) {
reportParseError(FuncRegOpnd.getStartLoc(), "invalid register");
return false;
}
FuncReg = FuncRegOpnd.getGPR32Reg();
TmpReg.clear();
if (!eatComma("unexpected token, expected comma"))
return true;
ResTy = parseAnyRegister(TmpReg);
if (ResTy == MatchOperand_NoMatch) {
const MCExpr *OffsetExpr;
int64_t OffsetVal;
SMLoc ExprLoc = getLexer().getLoc();
if (Parser.parseExpression(OffsetExpr) ||
!OffsetExpr->evaluateAsAbsolute(OffsetVal)) {
reportParseError(ExprLoc, "expected save register or stack offset");
return false;
}
Save = OffsetVal;
SaveIsReg = false;
} else {
MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
if (!SaveOpnd.isGPRAsmReg()) {
reportParseError(SaveOpnd.getStartLoc(), "invalid register");
return false;
}
Save = SaveOpnd.getGPR32Reg();
}
if (!eatComma("unexpected token, expected comma"))
return true;
const MCExpr *Expr;
if (Parser.parseExpression(Expr)) {
reportParseError("expected expression");
return false;
}
if (Expr->getKind() != MCExpr::SymbolRef) {
reportParseError("expected symbol");
return false;
}
const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
CpSaveLocation = Save;
CpSaveLocationIsRegister = SaveIsReg;
getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(),
SaveIsReg);
return false;
}
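// For illustration (offset, register and symbol are hypothetical):
//   .cpsetup $25, 8, my_func     # save the old $gp at 8($sp)
//   .cpsetup $25, $2, my_func    # or save it in a register instead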
bool MipsAsmParser::parseDirectiveCPReturn() {
getTargetStreamer().emitDirectiveCpreturn(CpSaveLocation,
CpSaveLocationIsRegister);
return false;
}
bool MipsAsmParser::parseDirectiveNaN() {
MCAsmParser &Parser = getParser();
if (getLexer().isNot(AsmToken::EndOfStatement)) {
const AsmToken &Tok = Parser.getTok();
if (Tok.getString() == "2008") {
Parser.Lex();
getTargetStreamer().emitDirectiveNaN2008();
return false;
} else if (Tok.getString() == "legacy") {
Parser.Lex();
getTargetStreamer().emitDirectiveNaNLegacy();
return false;
}
}
// If we don't recognize the option passed to the .nan
// directive (e.g. no option or unknown option), emit an error.
reportParseError("invalid option in .nan directive");
return false;
}
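// For illustration, the two accepted forms of the directive are:
//   .nan 2008
//   .nan legacy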
bool MipsAsmParser::parseDirectiveSet() {
const AsmToken &Tok = getParser().getTok();
StringRef IdVal = Tok.getString();
SMLoc Loc = Tok.getLoc();
if (IdVal == "noat")
return parseSetNoAtDirective();
if (IdVal == "at")
return parseSetAtDirective();
if (IdVal == "arch")
return parseSetArchDirective();
if (IdVal == "bopt") {
Warning(Loc, "'bopt' feature is unsupported");
getParser().Lex();
return false;
}
if (IdVal == "nobopt") {
// We're already running in nobopt mode, so nothing to do.
getParser().Lex();
return false;
}
if (IdVal == "fp")
return parseSetFpDirective();
if (IdVal == "oddspreg")
return parseSetOddSPRegDirective();
if (IdVal == "nooddspreg")
return parseSetNoOddSPRegDirective();
if (IdVal == "pop")
return parseSetPopDirective();
if (IdVal == "push")
return parseSetPushDirective();
if (IdVal == "reorder")
return parseSetReorderDirective();
if (IdVal == "noreorder")
return parseSetNoReorderDirective();
if (IdVal == "macro")
return parseSetMacroDirective();
if (IdVal == "nomacro")
return parseSetNoMacroDirective();
if (IdVal == "mips16")
return parseSetMips16Directive();
if (IdVal == "nomips16")
return parseSetNoMips16Directive();
if (IdVal == "nomicromips") {
clearFeatureBits(Mips::FeatureMicroMips, "micromips");
getTargetStreamer().emitDirectiveSetNoMicroMips();
getParser().eatToEndOfStatement();
return false;
}
if (IdVal == "micromips") {
if (hasMips64r6()) {
Error(Loc, ".set micromips directive is not supported with MIPS64R6");
return false;
}
return parseSetFeature(Mips::FeatureMicroMips);
}
if (IdVal == "mips0")
return parseSetMips0Directive();
if (IdVal == "mips1")
return parseSetFeature(Mips::FeatureMips1);
if (IdVal == "mips2")
return parseSetFeature(Mips::FeatureMips2);
if (IdVal == "mips3")
return parseSetFeature(Mips::FeatureMips3);
if (IdVal == "mips4")
return parseSetFeature(Mips::FeatureMips4);
if (IdVal == "mips5")
return parseSetFeature(Mips::FeatureMips5);
if (IdVal == "mips32")
return parseSetFeature(Mips::FeatureMips32);
if (IdVal == "mips32r2")
return parseSetFeature(Mips::FeatureMips32r2);
if (IdVal == "mips32r3")
return parseSetFeature(Mips::FeatureMips32r3);
if (IdVal == "mips32r5")
return parseSetFeature(Mips::FeatureMips32r5);
if (IdVal == "mips32r6")
return parseSetFeature(Mips::FeatureMips32r6);
if (IdVal == "mips64")
return parseSetFeature(Mips::FeatureMips64);
if (IdVal == "mips64r2")
return parseSetFeature(Mips::FeatureMips64r2);
if (IdVal == "mips64r3")
return parseSetFeature(Mips::FeatureMips64r3);
if (IdVal == "mips64r5")
return parseSetFeature(Mips::FeatureMips64r5);
if (IdVal == "mips64r6") {
if (inMicroMipsMode()) {
Error(Loc, "MIPS64R6 is not supported with microMIPS");
return false;
}
return parseSetFeature(Mips::FeatureMips64r6);
}
if (IdVal == "dsp")
return parseSetFeature(Mips::FeatureDSP);
if (IdVal == "dspr2")
return parseSetFeature(Mips::FeatureDSPR2);
if (IdVal == "nodsp")
return parseSetNoDspDirective();
if (IdVal == "msa")
return parseSetMsaDirective();
if (IdVal == "nomsa")
return parseSetNoMsaDirective();
if (IdVal == "mt")
return parseSetMtDirective();
if (IdVal == "nomt")
return parseSetNoMtDirective();
if (IdVal == "softfloat")
return parseSetSoftFloatDirective();
if (IdVal == "hardfloat")
return parseSetHardFloatDirective();
if (IdVal == "crc")
return parseSetFeature(Mips::FeatureCRC);
if (IdVal == "nocrc")
return parseSetNoCRCDirective();
if (IdVal == "virt")
return parseSetFeature(Mips::FeatureVirt);
if (IdVal == "novirt")
return parseSetNoVirtDirective();
if (IdVal == "ginv")
return parseSetFeature(Mips::FeatureGINV);
if (IdVal == "noginv")
return parseSetNoGINVDirective();
// It is just an identifier, look for an assignment.
return parseSetAssignment();
}
/// parseDirectiveGpWord
/// ::= .gpword local_sym
bool MipsAsmParser::parseDirectiveGpWord() {
MCAsmParser &Parser = getParser();
const MCExpr *Value;
// EmitGPRel32Value requires an expression, so we are using the base class
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
getParser().getStreamer().EmitGPRel32Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
"unexpected token, expected end of statement");
Parser.Lex(); // Eat EndOfStatement token.
return false;
}
/// parseDirectiveGpDWord
/// ::= .gpdword local_sym
bool MipsAsmParser::parseDirectiveGpDWord() {
MCAsmParser &Parser = getParser();
const MCExpr *Value;
// EmitGPRel64Value requires an expression, so we are using the base class
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
getParser().getStreamer().EmitGPRel64Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
"unexpected token, expected end of statement");
Parser.Lex(); // Eat EndOfStatement token.
return false;
}
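// For illustration (the symbol is hypothetical):
//   .gpword local_label     # 32-bit GP-relative word
//   .gpdword local_label    # 64-bit GP-relative word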
/// parseDirectiveDtpRelWord
/// ::= .dtprelword tls_sym
bool MipsAsmParser::parseDirectiveDtpRelWord() {
MCAsmParser &Parser = getParser();
const MCExpr *Value;
// EmitDTPRel32Value requires an expression, so we are using the base class
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
getParser().getStreamer().EmitDTPRel32Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
"unexpected token, expected end of statement");
Parser.Lex(); // Eat EndOfStatement token.
return false;
}
/// parseDirectiveDtpRelDWord
/// ::= .dtpreldword tls_sym
bool MipsAsmParser::parseDirectiveDtpRelDWord() {
MCAsmParser &Parser = getParser();
const MCExpr *Value;
// EmitDTPRel64Value requires an expression, so we are using the base class
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
getParser().getStreamer().EmitDTPRel64Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
"unexpected token, expected end of statement");
Parser.Lex(); // Eat EndOfStatement token.
return false;
}
/// parseDirectiveTpRelWord
/// ::= .tprelword tls_sym
bool MipsAsmParser::parseDirectiveTpRelWord() {
MCAsmParser &Parser = getParser();
const MCExpr *Value;
// EmitTPRel32Value requires an expression, so we are using the base class
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
getParser().getStreamer().EmitTPRel32Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
"unexpected token, expected end of statement");
Parser.Lex(); // Eat EndOfStatement token.
return false;
}
/// parseDirectiveTpRelDWord
/// ::= .tpreldword tls_sym
bool MipsAsmParser::parseDirectiveTpRelDWord() {
MCAsmParser &Parser = getParser();
const MCExpr *Value;
// EmitTPRel64Value requires an expression, so we are using the base class
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
getParser().getStreamer().EmitTPRel64Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
"unexpected token, expected end of statement");
Parser.Lex(); // Eat EndOfStatement token.
return false;
}
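// For illustration (the TLS symbol is hypothetical):
//   .dtprelword tls_var     # 32-bit DTP-relative value
//   .tpreldword tls_var     # 64-bit TP-relative value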
bool MipsAsmParser::parseDirectiveOption() {
MCAsmParser &Parser = getParser();
// Get the option token.
AsmToken Tok = Parser.getTok();
// At the moment only identifiers are supported.
if (Tok.isNot(AsmToken::Identifier)) {
return Error(Parser.getTok().getLoc(),
"unexpected token, expected identifier");
}
StringRef Option = Tok.getIdentifier();
if (Option == "pic0") {
// MipsAsmParser needs to know if the current PIC mode changes.
IsPicEnabled = false;
getTargetStreamer().emitDirectiveOptionPic0();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
return Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
}
return false;
}
if (Option == "pic2") {
// MipsAsmParser needs to know if the current PIC mode changes.
IsPicEnabled = true;
getTargetStreamer().emitDirectiveOptionPic2();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
return Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
}
return false;
}
// Unknown option.
Warning(Parser.getTok().getLoc(),
"unknown option, expected 'pic0' or 'pic2'");
Parser.eatToEndOfStatement();
return false;
}
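// For illustration, the two options recognized above:
//   .option pic0    # the code that follows is non-PIC
//   .option pic2    # the code that follows is PIC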
/// parseInsnDirective
/// ::= .insn
bool MipsAsmParser::parseInsnDirective() {
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
// The actual label marking happens in
// MipsELFStreamer::createPendingLabelRelocs().
getTargetStreamer().emitDirectiveInsn();
getParser().Lex(); // Eat EndOfStatement token.
return false;
}
/// parseRSectionDirective
/// ::= .rdata
bool MipsAsmParser::parseRSectionDirective(StringRef Section) {
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
MCSection *ELFSection = getContext().getELFSection(
Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
getParser().getStreamer().SwitchSection(ELFSection);
getParser().Lex(); // Eat EndOfStatement token.
return false;
}
/// parseSSectionDirective
/// ::= .sbss
/// ::= .sdata
bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) {
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
MCSection *ELFSection = getContext().getELFSection(
Section, Type, ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL);
getParser().getStreamer().SwitchSection(ELFSection);
getParser().Lex(); // Eat EndOfStatement token.
return false;
}
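// For illustration, the section directives handled by the two routines above:
//   .rdata    # switches to .rodata
//   .sdata    # small, GP-relative data section
//   .sbss     # small, GP-relative BSS section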
/// parseDirectiveModule
/// ::= .module oddspreg
/// ::= .module nooddspreg
/// ::= .module fp=value
/// ::= .module softfloat
/// ::= .module hardfloat
/// ::= .module mt
/// ::= .module crc
/// ::= .module nocrc
/// ::= .module virt
/// ::= .module novirt
/// ::= .module ginv
/// ::= .module noginv
bool MipsAsmParser::parseDirectiveModule() {
MCAsmParser &Parser = getParser();
MCAsmLexer &Lexer = getLexer();
SMLoc L = Lexer.getLoc();
if (!getTargetStreamer().isModuleDirectiveAllowed()) {
// TODO: get a better message.
reportParseError(".module directive must appear before any code");
return false;
}
StringRef Option;
if (Parser.parseIdentifier(Option)) {
reportParseError("expected .module option identifier");
return false;
}
if (Option == "oddspreg") {
clearModuleFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
// Synchronize the abiflags information with the FeatureBits information we
// changed above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated abiflags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted at the end).
getTargetStreamer().emitDirectiveModuleOddSPReg();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "nooddspreg") {
if (!isABI_O32()) {
return Error(L, "'.module nooddspreg' requires the O32 ABI");
}
setModuleFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
// Synchronize the abiflags information with the FeatureBits information we
// changed above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated abiflags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted at the end).
getTargetStreamer().emitDirectiveModuleOddSPReg();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "fp") {
return parseDirectiveModuleFP();
} else if (Option == "softfloat") {
setModuleFeatureBits(Mips::FeatureSoftFloat, "soft-float");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleSoftFloat();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "hardfloat") {
clearModuleFeatureBits(Mips::FeatureSoftFloat, "soft-float");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleHardFloat();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "mt") {
setModuleFeatureBits(Mips::FeatureMT, "mt");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleMT();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "crc") {
setModuleFeatureBits(Mips::FeatureCRC, "crc");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleCRC();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "nocrc") {
clearModuleFeatureBits(Mips::FeatureCRC, "crc");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleNoCRC();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "virt") {
setModuleFeatureBits(Mips::FeatureVirt, "virt");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleVirt();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "novirt") {
clearModuleFeatureBits(Mips::FeatureVirt, "virt");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleNoVirt();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "ginv") {
setModuleFeatureBits(Mips::FeatureGINV, "ginv");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleGINV();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "noginv") {
clearModuleFeatureBits(Mips::FeatureGINV, "ginv");
// Synchronize the ABI Flags information with the FeatureBits information we
// updated above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated ABI Flags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted later).
getTargetStreamer().emitDirectiveModuleNoGINV();
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else {
return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
}
}
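// For illustration, typical module-level directives handled above:
//   .module fp=xx
//   .module nooddspreg     # only valid for the O32 ABI
//   .module softfloat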
/// parseDirectiveModuleFP
/// ::= =32
/// ::= =xx
/// ::= =64
bool MipsAsmParser::parseDirectiveModuleFP() {
MCAsmParser &Parser = getParser();
MCAsmLexer &Lexer = getLexer();
if (Lexer.isNot(AsmToken::Equal)) {
reportParseError("unexpected token, expected equals sign '='");
return false;
}
Parser.Lex(); // Eat '=' token.
MipsABIFlagsSection::FpABIKind FpABI;
if (!parseFpABIValue(FpABI, ".module"))
return false;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
// Synchronize the abiflags information with the FeatureBits information we
// changed above.
getTargetStreamer().updateABIInfo(*this);
// If printing assembly, use the recently updated abiflags information.
// If generating ELF, don't do anything (the .MIPS.abiflags section gets
// emitted at the end).
getTargetStreamer().emitDirectiveModuleFP();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
StringRef Directive) {
MCAsmParser &Parser = getParser();
MCAsmLexer &Lexer = getLexer();
bool ModuleLevelOptions = Directive == ".module";
if (Lexer.is(AsmToken::Identifier)) {
StringRef Value = Parser.getTok().getString();
Parser.Lex();
if (Value != "xx") {
reportParseError("unsupported value, expected 'xx', '32' or '64'");
return false;
}
if (!isABI_O32()) {
reportParseError("'" + Directive + " fp=xx' requires the O32 ABI");
return false;
}
FpABI = MipsABIFlagsSection::FpABIKind::XX;
if (ModuleLevelOptions) {
setModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
clearModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
} else {
setFeatureBits(Mips::FeatureFPXX, "fpxx");
clearFeatureBits(Mips::FeatureFP64Bit, "fp64");
}
return true;
}
if (Lexer.is(AsmToken::Integer)) {
unsigned Value = Parser.getTok().getIntVal();
Parser.Lex();
if (Value != 32 && Value != 64) {
reportParseError("unsupported value, expected 'xx', '32' or '64'");
return false;
}
if (Value == 32) {
if (!isABI_O32()) {
reportParseError("'" + Directive + " fp=32' requires the O32 ABI");
return false;
}
FpABI = MipsABIFlagsSection::FpABIKind::S32;
if (ModuleLevelOptions) {
clearModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
clearModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
} else {
clearFeatureBits(Mips::FeatureFPXX, "fpxx");
clearFeatureBits(Mips::FeatureFP64Bit, "fp64");
}
} else {
FpABI = MipsABIFlagsSection::FpABIKind::S64;
if (ModuleLevelOptions) {
clearModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
setModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
} else {
clearFeatureBits(Mips::FeatureFPXX, "fpxx");
setFeatureBits(Mips::FeatureFP64Bit, "fp64");
}
}
return true;
}
return false;
}
bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
// This returns false if this function recognizes the directive
// regardless of whether it successfully handles it or reports an
// error. Otherwise it returns true to give the generic parser a
// chance at recognizing it.
MCAsmParser &Parser = getParser();
StringRef IDVal = DirectiveID.getString();
if (IDVal == ".cpload") {
parseDirectiveCpLoad(DirectiveID.getLoc());
return false;
}
if (IDVal == ".cprestore") {
parseDirectiveCpRestore(DirectiveID.getLoc());
return false;
}
if (IDVal == ".cplocal") {
parseDirectiveCpLocal(DirectiveID.getLoc());
return false;
}
if (IDVal == ".ent") {
StringRef SymbolName;
if (Parser.parseIdentifier(SymbolName)) {
reportParseError("expected identifier after .ent");
return false;
}
// There's an undocumented extension that allows an integer to
// follow the name of the procedure; as far as I can see, GAS ignores it.
// Example: .ent foo,2
if (getLexer().isNot(AsmToken::EndOfStatement)) {
if (getLexer().isNot(AsmToken::Comma)) {
// Even though we accept this undocumented extension for compatibility
// reasons, the additional integer argument does not actually change
// the behaviour of the '.ent' directive, so we would like to discourage
// its use. We do this by not referring to the extended version in
// error messages which are not directly related to its use.
reportParseError("unexpected token, expected end of statement");
return false;
}
Parser.Lex(); // Eat the comma.
const MCExpr *DummyNumber;
int64_t DummyNumberVal;
// If the user was explicitly trying to use the extended version,
// we still give helpful extension-related error messages.
if (Parser.parseExpression(DummyNumber)) {
reportParseError("expected number after comma");
return false;
}
if (!DummyNumber->evaluateAsAbsolute(DummyNumberVal)) {
reportParseError("expected an absolute expression after comma");
return false;
}
}
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName);
getTargetStreamer().emitDirectiveEnt(*Sym);
CurrentFn = Sym;
IsCpRestoreSet = false;
return false;
}
if (IDVal == ".end") {
StringRef SymbolName;
if (Parser.parseIdentifier(SymbolName)) {
reportParseError("expected identifier after .end");
return false;
}
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
if (CurrentFn == nullptr) {
reportParseError(".end used without .ent");
return false;
}
if ((SymbolName != CurrentFn->getName())) {
reportParseError(".end symbol does not match .ent symbol");
return false;
}
getTargetStreamer().emitDirectiveEnd(SymbolName);
CurrentFn = nullptr;
IsCpRestoreSet = false;
return false;
}
if (IDVal == ".frame") {
// .frame $stack_reg, frame_size_in_bytes, $return_reg
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
reportParseError("expected stack register");
return false;
}
MipsOperand &StackRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
if (!StackRegOpnd.isGPRAsmReg()) {
reportParseError(StackRegOpnd.getStartLoc(),
"expected general purpose register");
return false;
}
unsigned StackReg = StackRegOpnd.getGPR32Reg();
if (Parser.getTok().is(AsmToken::Comma))
Parser.Lex();
else {
reportParseError("unexpected token, expected comma");
return false;
}
// Parse the frame size.
const MCExpr *FrameSize;
int64_t FrameSizeVal;
if (Parser.parseExpression(FrameSize)) {
reportParseError("expected frame size value");
return false;
}
if (!FrameSize->evaluateAsAbsolute(FrameSizeVal)) {
reportParseError("frame size not an absolute expression");
return false;
}
if (Parser.getTok().is(AsmToken::Comma))
Parser.Lex();
else {
reportParseError("unexpected token, expected comma");
return false;
}
// Parse the return register.
TmpReg.clear();
ResTy = parseAnyRegister(TmpReg);
if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
reportParseError("expected return register");
return false;
}
MipsOperand &ReturnRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
if (!ReturnRegOpnd.isGPRAsmReg()) {
reportParseError(ReturnRegOpnd.getStartLoc(),
"expected general purpose register");
return false;
}
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
getTargetStreamer().emitFrame(StackReg, FrameSizeVal,
ReturnRegOpnd.getGPR32Reg());
IsCpRestoreSet = false;
return false;
}
if (IDVal == ".set") {
parseDirectiveSet();
return false;
}
if (IDVal == ".mask" || IDVal == ".fmask") {
// .mask bitmask, frame_offset
// bitmask: One bit for each register used.
// frame_offset: Offset from Canonical Frame Address ($sp on entry) where
// first register is expected to be saved.
// Examples:
// .mask 0x80000000, -4
// .fmask 0x80000000, -4
//
// Parse the bitmask
const MCExpr *BitMask;
int64_t BitMaskVal;
if (Parser.parseExpression(BitMask)) {
reportParseError("expected bitmask value");
return false;
}
if (!BitMask->evaluateAsAbsolute(BitMaskVal)) {
reportParseError("bitmask not an absolute expression");
return false;
}
if (Parser.getTok().is(AsmToken::Comma))
Parser.Lex();
else {
reportParseError("unexpected token, expected comma");
return false;
}
// Parse the frame_offset
const MCExpr *FrameOffset;
int64_t FrameOffsetVal;
if (Parser.parseExpression(FrameOffset)) {
reportParseError("expected frame offset value");
return false;
}
if (!FrameOffset->evaluateAsAbsolute(FrameOffsetVal)) {
reportParseError("frame offset not an absolute expression");
return false;
}
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
if (IDVal == ".mask")
getTargetStreamer().emitMask(BitMaskVal, FrameOffsetVal);
else
getTargetStreamer().emitFMask(BitMaskVal, FrameOffsetVal);
return false;
}
if (IDVal == ".nan")
return parseDirectiveNaN();
if (IDVal == ".gpword") {
parseDirectiveGpWord();
return false;
}
if (IDVal == ".gpdword") {
parseDirectiveGpDWord();
return false;
}
if (IDVal == ".dtprelword") {
parseDirectiveDtpRelWord();
return false;
}
if (IDVal == ".dtpreldword") {
parseDirectiveDtpRelDWord();
return false;
}
if (IDVal == ".tprelword") {
parseDirectiveTpRelWord();
return false;
}
if (IDVal == ".tpreldword") {
parseDirectiveTpRelDWord();
return false;
}
if (IDVal == ".option") {
parseDirectiveOption();
return false;
}
if (IDVal == ".abicalls") {
getTargetStreamer().emitDirectiveAbiCalls();
if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
}
return false;
}
if (IDVal == ".cpsetup") {
parseDirectiveCPSetup();
return false;
}
if (IDVal == ".cpreturn") {
parseDirectiveCPReturn();
return false;
}
if (IDVal == ".module") {
parseDirectiveModule();
return false;
}
if (IDVal == ".llvm_internal_mips_reallow_module_directive") {
parseInternalDirectiveReallowModule();
return false;
}
if (IDVal == ".insn") {
parseInsnDirective();
return false;
}
if (IDVal == ".rdata") {
parseRSectionDirective(".rodata");
return false;
}
if (IDVal == ".sbss") {
parseSSectionDirective(IDVal, ELF::SHT_NOBITS);
return false;
}
if (IDVal == ".sdata") {
parseSSectionDirective(IDVal, ELF::SHT_PROGBITS);
return false;
}
return true;
}
bool MipsAsmParser::parseInternalDirectiveReallowModule() {
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
getTargetStreamer().reallowModuleDirective();
getParser().Lex(); // Eat EndOfStatement token.
return false;
}
extern "C" void LLVMInitializeMipsAsmParser() {
RegisterMCAsmParser<MipsAsmParser> X(getTheMipsTarget());
RegisterMCAsmParser<MipsAsmParser> Y(getTheMipselTarget());
RegisterMCAsmParser<MipsAsmParser> A(getTheMips64Target());
RegisterMCAsmParser<MipsAsmParser> B(getTheMips64elTarget());
}
#define GET_REGISTER_MATCHER
#define GET_MATCHER_IMPLEMENTATION
#define GET_MNEMONIC_SPELL_CHECKER
#include "MipsGenAsmMatcher.inc"
bool MipsAsmParser::mnemonicIsValid(StringRef Mnemonic, unsigned VariantID) {
// Find the appropriate table for this asm variant.
const MatchEntry *Start, *End;
switch (VariantID) {
default: llvm_unreachable("invalid variant!");
case 0: Start = std::begin(MatchTable0); End = std::end(MatchTable0); break;
}
// Search the table.
auto MnemonicRange = std::equal_range(Start, End, Mnemonic, LessOpcode());
return MnemonicRange.first != MnemonicRange.second;
}
Index: projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp (revision 351722)
@@ -1,289 +1,292 @@
//===-- RISCVISelDAGToDAG.cpp - A dag to dag inst selector for RISCV ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the RISCV target.
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "riscv-isel"
// RISCV-specific code to select RISCV machine instructions for
// SelectionDAG operations.
namespace {
class RISCVDAGToDAGISel final : public SelectionDAGISel {
const RISCVSubtarget *Subtarget;
public:
explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine)
: SelectionDAGISel(TargetMachine) {}
StringRef getPassName() const override {
return "RISCV DAG->DAG Pattern Instruction Selection";
}
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<RISCVSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
void PostprocessISelDAG() override;
void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
bool SelectAddrFI(SDValue Addr, SDValue &Base);
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"
private:
void doPeepholeLoadStoreADDI();
};
}
void RISCVDAGToDAGISel::PostprocessISelDAG() {
doPeepholeLoadStoreADDI();
}
static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
MVT XLenVT) {
RISCVMatInt::InstSeq Seq;
RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq);
SDNode *Result;
SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT);
for (RISCVMatInt::Inst &Inst : Seq) {
SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT);
if (Inst.Opc == RISCV::LUI)
Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm);
else
Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm);
// Only the first instruction has X0 as its source.
SrcReg = SDValue(Result, 0);
}
return Result;
}
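// For illustration (a sketch of the materialization above; the value is
// hypothetical): on RV32 an immediate such as 0x12345678 is normally built as
//   lui  rd, 0x12345
//   addi rd, rd, 0x678
// Because the low 12 bits (0x678) are below 0x800, the LUI value needs no
// rounding to compensate for ADDI's sign extension.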
// Returns true if the Node is an ISD::AND with a constant argument. If so,
// sets Mask to that constant value.
static bool isConstantMask(SDNode *Node, uint64_t &Mask) {
if (Node->getOpcode() == ISD::AND &&
Node->getOperand(1).getOpcode() == ISD::Constant) {
Mask = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
return true;
}
return false;
}
void RISCVDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we have already selected.
if (Node->isMachineOpcode()) {
LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
Node->setNodeId(-1);
return;
}
// Instruction Selection not handled by the auto-generated tablegen selection
// should be handled here.
unsigned Opcode = Node->getOpcode();
MVT XLenVT = Subtarget->getXLenVT();
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
switch (Opcode) {
case ISD::Constant: {
auto ConstNode = cast<ConstantSDNode>(Node);
if (VT == XLenVT && ConstNode->isNullValue()) {
SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
RISCV::X0, XLenVT);
ReplaceNode(Node, New.getNode());
return;
}
int64_t Imm = ConstNode->getSExtValue();
if (XLenVT == MVT::i64) {
ReplaceNode(Node, selectImm(CurDAG, SDLoc(Node), Imm, XLenVT));
return;
}
break;
}
case ISD::FrameIndex: {
SDValue Imm = CurDAG->getTargetConstant(0, DL, XLenVT);
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
return;
}
case ISD::SRL: {
if (!Subtarget->is64Bit())
break;
SDValue Op0 = Node->getOperand(0);
SDValue Op1 = Node->getOperand(1);
uint64_t Mask;
// Match (srl (and val, mask), imm) where the result would be a
// zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result
// is equivalent to this (SimplifyDemandedBits may have removed lower bits
// from the mask that aren't necessary due to the right-shifting).
if (Op1.getOpcode() == ISD::Constant &&
isConstantMask(Op0.getNode(), Mask)) {
uint64_t ShAmt = cast<ConstantSDNode>(Op1.getNode())->getZExtValue();
if ((Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff) {
SDValue ShAmtVal =
CurDAG->getTargetConstant(ShAmt, SDLoc(Node), XLenVT);
CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0.getOperand(0),
ShAmtVal);
return;
}
}
break;
}
case RISCVISD::READ_CYCLE_WIDE:
assert(!Subtarget->is64Bit() && "READ_CYCLE_WIDE is only used on riscv32");
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ReadCycleWide, DL, MVT::i32,
MVT::i32, MVT::Other,
Node->getOperand(0)));
return;
}
// Select the default instruction.
SelectCode(Node);
}
bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand(
const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
switch (ConstraintID) {
case InlineAsm::Constraint_i:
case InlineAsm::Constraint_m:
// We just support simple memory operands that have a single address
// operand and need no special handling.
OutOps.push_back(Op);
return false;
+ case InlineAsm::Constraint_A:
+ OutOps.push_back(Op);
+ return false;
default:
break;
}
return true;
}
bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
if (auto FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT());
return true;
}
return false;
}
// Merge an ADDI into the offset of a load/store instruction where possible.
// (load (add base, off), 0) -> (load base, off)
// (store val, (add base, off)) -> (store val, base, off)
void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
++Position;
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
continue;
int OffsetOpIdx;
int BaseOpIdx;
// Only attempt this optimisation for I-type loads and S-type stores.
switch (N->getMachineOpcode()) {
default:
continue;
case RISCV::LB:
case RISCV::LH:
case RISCV::LW:
case RISCV::LBU:
case RISCV::LHU:
case RISCV::LWU:
case RISCV::LD:
case RISCV::FLW:
case RISCV::FLD:
BaseOpIdx = 0;
OffsetOpIdx = 1;
break;
case RISCV::SB:
case RISCV::SH:
case RISCV::SW:
case RISCV::SD:
case RISCV::FSW:
case RISCV::FSD:
BaseOpIdx = 1;
OffsetOpIdx = 2;
break;
}
// Currently, the load/store offset must be 0 to be considered for this
// peephole optimisation.
if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)) ||
N->getConstantOperandVal(OffsetOpIdx) != 0)
continue;
SDValue Base = N->getOperand(BaseOpIdx);
// If the base is an ADDI, we can merge it in to the load/store.
if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
continue;
SDValue ImmOperand = Base.getOperand(1);
if (auto Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
ImmOperand = CurDAG->getTargetConstant(
Const->getSExtValue(), SDLoc(ImmOperand), ImmOperand.getValueType());
} else if (auto GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
ImmOperand = CurDAG->getTargetGlobalAddress(
GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
GA->getOffset(), GA->getTargetFlags());
} else {
continue;
}
LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
LLVM_DEBUG(Base->dump(CurDAG));
LLVM_DEBUG(dbgs() << "\nN: ");
LLVM_DEBUG(N->dump(CurDAG));
LLVM_DEBUG(dbgs() << "\n");
// Modify the offset operand of the load/store.
if (BaseOpIdx == 0) // Load
CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
N->getOperand(2));
else // Store
CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
ImmOperand, N->getOperand(3));
// The add-immediate may now be dead, in which case remove it.
if (Base.getNode()->use_empty())
CurDAG->RemoveDeadNode(Base.getNode());
}
}
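// For illustration (a sketch; registers and symbol are hypothetical), the
// peephole above rewrites
//   ADDI a1, a0, %lo(sym)
//   LW   a2, 0(a1)
// into
//   LW   a2, %lo(sym)(a0)
// and deletes the ADDI once it has no remaining users.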
// This pass converts a legalized DAG into a RISCV-specific DAG, ready
// for instruction scheduling.
FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM) {
return new RISCVDAGToDAGISel(TM);
}
Index: projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.cpp (revision 351722)
@@ -1,2648 +1,2665 @@
//===-- RISCVISelLowering.cpp - RISCV DAG Lowering Implementation --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that RISCV uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "RISCVISelLowering.h"
#include "RISCV.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVRegisterInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "riscv-lower"
STATISTIC(NumTailCalls, "Number of tail calls");
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
if (Subtarget.isRV32E())
report_fatal_error("Codegen not yet implemented for RV32E");
RISCVABI::ABI ABI = Subtarget.getTargetABI();
assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI");
switch (ABI) {
default:
report_fatal_error("Don't know how to lower this ABI");
case RISCVABI::ABI_ILP32:
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64:
case RISCVABI::ABI_LP64F:
case RISCVABI::ABI_LP64D:
break;
}
MVT XLenVT = Subtarget.getXLenVT();
// Set up the register classes.
addRegisterClass(XLenVT, &RISCV::GPRRegClass);
if (Subtarget.hasStdExtF())
addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD())
addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
// Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo());
setStackPointerRegisterToSaveRestore(RISCV::X2);
for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD})
setLoadExtAction(N, XLenVT, MVT::i1, Promote);
// TODO: add all necessary setOperationAction calls.
setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, XLenVT, Expand);
setOperationAction(ISD::SELECT, XLenVT, Custom);
setOperationAction(ISD::SELECT_CC, XLenVT, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
for (auto VT : {MVT::i1, MVT::i8, MVT::i16})
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
}
if (!Subtarget.hasStdExtM()) {
setOperationAction(ISD::MUL, XLenVT, Expand);
setOperationAction(ISD::MULHS, XLenVT, Expand);
setOperationAction(ISD::MULHU, XLenVT, Expand);
setOperationAction(ISD::SDIV, XLenVT, Expand);
setOperationAction(ISD::UDIV, XLenVT, Expand);
setOperationAction(ISD::SREM, XLenVT, Expand);
setOperationAction(ISD::UREM, XLenVT, Expand);
}
if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
setOperationAction(ISD::SDIV, MVT::i32, Custom);
setOperationAction(ISD::UDIV, MVT::i32, Custom);
setOperationAction(ISD::UREM, MVT::i32, Custom);
}
setOperationAction(ISD::SDIVREM, XLenVT, Expand);
setOperationAction(ISD::UDIVREM, XLenVT, Expand);
setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::SHL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);
setOperationAction(ISD::ROTL, XLenVT, Expand);
setOperationAction(ISD::ROTR, XLenVT, Expand);
setOperationAction(ISD::BSWAP, XLenVT, Expand);
setOperationAction(ISD::CTTZ, XLenVT, Expand);
setOperationAction(ISD::CTLZ, XLenVT, Expand);
setOperationAction(ISD::CTPOP, XLenVT, Expand);
ISD::CondCode FPCCToExtend[] = {
ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
ISD::SETGE, ISD::SETNE};
ISD::NodeType FPOpToExtend[] = {
ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM};
if (Subtarget.hasStdExtF()) {
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
for (auto CC : FPCCToExtend)
setCondCodeAction(CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
for (auto Op : FPOpToExtend)
setOperationAction(Op, MVT::f32, Expand);
}
if (Subtarget.hasStdExtF() && Subtarget.is64Bit())
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
if (Subtarget.hasStdExtD()) {
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
for (auto CC : FPCCToExtend)
setCondCodeAction(CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
for (auto Op : FPOpToExtend)
setOperationAction(Op, MVT::f64, Expand);
}
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
setOperationAction(ISD::ConstantPool, XLenVT, Custom);
setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom);
// TODO: On M-mode only targets, the cycle[h] CSR may not be present.
// Unfortunately this can't be determined just from the ISA naming string.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
Subtarget.is64Bit() ? Legal : Custom);
if (Subtarget.hasStdExtA()) {
setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
setMinCmpXchgSizeInBits(32);
} else {
setMaxAtomicSizeInBitsSupported(0);
}
setBooleanContents(ZeroOrOneBooleanContent);
// Function alignments (log2).
unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2;
setMinFunctionAlignment(FunctionAlignment);
setPrefFunctionAlignment(FunctionAlignment);
// Effectively disable jump table generation.
setMinimumJumpTableEntries(INT_MAX);
}
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const {
if (!VT.isVector())
return getPointerTy(DL);
return VT.changeVectorElementTypeToInteger();
}
bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
default:
return false;
case Intrinsic::riscv_masked_atomicrmw_xchg_i32:
case Intrinsic::riscv_masked_atomicrmw_add_i32:
case Intrinsic::riscv_masked_atomicrmw_sub_i32:
case Intrinsic::riscv_masked_atomicrmw_nand_i32:
case Intrinsic::riscv_masked_atomicrmw_max_i32:
case Intrinsic::riscv_masked_atomicrmw_min_i32:
case Intrinsic::riscv_masked_atomicrmw_umax_i32:
case Intrinsic::riscv_masked_atomicrmw_umin_i32:
case Intrinsic::riscv_masked_cmpxchg_i32:
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = 4;
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile;
return true;
}
}
bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// Require a 12-bit signed offset.
if (!isInt<12>(AM.BaseOffs))
return false;
switch (AM.Scale) {
case 0: // "r+i" or just "i", depending on HasBaseReg.
break;
case 1:
if (!AM.HasBaseReg) // allow "r+i".
break;
return false; // disallow "r+r" or "r+r+i".
default:
return false;
}
return true;
}
bool RISCVTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<12>(Imm);
}
bool RISCVTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return isInt<12>(Imm);
}
// On RV32, 64-bit integers are split into their high and low parts and held
// in two different registers, so the trunc is free since the low register can
// just be used.
bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
return false;
unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
unsigned DestBits = DstTy->getPrimitiveSizeInBits();
return (SrcBits == 64 && DestBits == 32);
}
bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() ||
!SrcVT.isInteger() || !DstVT.isInteger())
return false;
unsigned SrcBits = SrcVT.getSizeInBits();
unsigned DestBits = DstVT.getSizeInBits();
return (SrcBits == 64 && DestBits == 32);
}
bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// Zexts are free if they can be combined with a load.
if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
EVT MemVT = LD->getMemoryVT();
if ((MemVT == MVT::i8 || MemVT == MVT::i16 ||
(Subtarget.is64Bit() && MemVT == MVT::i32)) &&
(LD->getExtensionType() == ISD::NON_EXTLOAD ||
LD->getExtensionType() == ISD::ZEXTLOAD))
return true;
}
return TargetLowering::isZExtFree(Val, VT2);
}
bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
}
bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
(VT == MVT::f64 && Subtarget.hasStdExtD());
}
// Changes the condition code and swaps operands if necessary, so the SetCC
// operation matches one of the comparisons supported directly in the RISC-V
// ISA.
static void normaliseSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
switch (CC) {
default:
break;
case ISD::SETGT:
case ISD::SETLE:
case ISD::SETUGT:
case ISD::SETULE:
CC = ISD::getSetCCSwappedOperands(CC);
std::swap(LHS, RHS);
break;
}
}
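// For example, normaliseSetCC rewrites (setcc a, b, setgt) as
// (setcc b, a, setlt) and (setcc a, b, setule) as (setcc b, a, setuge),
// both of which map directly onto BLT/BGEU-style compares; EQ/NE and the
// LT/GE/ULT/UGE codes are left untouched.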
// Return the RISC-V branch opcode that matches the given DAG integer
// condition code. The CondCode must be one of those supported by the RISC-V
// ISA (see normaliseSetCC).
static unsigned getBranchOpcodeForIntCondCode(ISD::CondCode CC) {
switch (CC) {
default:
llvm_unreachable("Unsupported CondCode");
case ISD::SETEQ:
return RISCV::BEQ;
case ISD::SETNE:
return RISCV::BNE;
case ISD::SETLT:
return RISCV::BLT;
case ISD::SETGE:
return RISCV::BGE;
case ISD::SETULT:
return RISCV::BLTU;
case ISD::SETUGE:
return RISCV::BGEU;
}
}
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
report_fatal_error("unimplemented operand");
case ISD::GlobalAddress:
return lowerGlobalAddress(Op, DAG);
case ISD::BlockAddress:
return lowerBlockAddress(Op, DAG);
case ISD::ConstantPool:
return lowerConstantPool(Op, DAG);
case ISD::GlobalTLSAddress:
return lowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT:
return lowerSELECT(Op, DAG);
case ISD::VASTART:
return lowerVASTART(Op, DAG);
case ISD::FRAMEADDR:
return lowerFRAMEADDR(Op, DAG);
case ISD::RETURNADDR:
return lowerRETURNADDR(Op, DAG);
case ISD::SHL_PARTS:
return lowerShiftLeftParts(Op, DAG);
case ISD::SRA_PARTS:
return lowerShiftRightParts(Op, DAG, true);
case ISD::SRL_PARTS:
return lowerShiftRightParts(Op, DAG, false);
case ISD::BITCAST: {
assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() &&
"Unexpected custom legalisation");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
if (Op.getValueType() != MVT::f32 || Op0.getValueType() != MVT::i32)
return SDValue();
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
return FPConv;
}
}
}
static SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags);
}
static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(),
Flags);
}
static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
N->getOffset(), Flags);
}
template <class NodeTy>
SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
bool IsLocal) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
if (isPositionIndependent()) {
SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
if (IsLocal)
// Use PC-relative addressing to access the symbol. This generates the
// pattern (PseudoLLA sym), which expands to (addi (auipc %pcrel_hi(sym))
// %pcrel_lo(auipc)).
return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
// Use PC-relative addressing to access the GOT for this symbol, then load
// the address from the GOT. This generates the pattern (PseudoLA sym),
// which expands to (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))).
return SDValue(DAG.getMachineNode(RISCV::PseudoLA, DL, Ty, Addr), 0);
}
switch (getTargetMachine().getCodeModel()) {
default:
report_fatal_error("Unsupported code model for lowering");
case CodeModel::Small: {
// Generate a sequence for accessing addresses within the first 2 GiB of
// address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, AddrLo), 0);
}
case CodeModel::Medium: {
// Generate a sequence for accessing addresses within any 2GiB range within
// the address space. This generates the pattern (PseudoLLA sym), which
// expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
}
}
}
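// Illustrative sequences emitted by getAddr for a symbol `sym` (register
// choice is arbitrary):
//   PIC, local symbol:       auipc a0, %pcrel_hi(sym)
//                            addi  a0, a0, %pcrel_lo(<auipc label>)
//   PIC, preemptible symbol: auipc a0, %got_pcrel_hi(sym)
//                            l[w|d] a0, %pcrel_lo(<auipc label>)(a0)
//   Small code model:        lui   a0, %hi(sym)
//                            addi  a0, a0, %lo(sym)
//   Medium code model:       auipc a0, %pcrel_hi(sym)
//                            addi  a0, a0, %pcrel_lo(<auipc label>)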
SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT Ty = Op.getValueType();
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
int64_t Offset = N->getOffset();
MVT XLenVT = Subtarget.getXLenVT();
const GlobalValue *GV = N->getGlobal();
bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
SDValue Addr = getAddr(N, DAG, IsLocal);
// In order to maximise the opportunity for common subexpression elimination,
// emit a separate ADD node for the global address offset instead of folding
// it in the global address node. Later peephole optimisations may choose to
// fold it back in when profitable.
if (Offset != 0)
return DAG.getNode(ISD::ADD, DL, Ty, Addr,
DAG.getConstant(Offset, DL, XLenVT));
return Addr;
}
SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
return getAddr(N, DAG);
}
SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
return getAddr(N, DAG);
}
SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
SelectionDAG &DAG,
bool UseGOT) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
const GlobalValue *GV = N->getGlobal();
MVT XLenVT = Subtarget.getXLenVT();
if (UseGOT) {
// Use PC-relative addressing to access the GOT for this TLS symbol, then
// load the address from the GOT and add the thread pointer. This generates
// the pattern (PseudoLA_TLS_IE sym), which expands to
// (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
SDValue Load =
SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Addr), 0);
// Add the thread pointer.
SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg);
}
// Generate a sequence for accessing the address relative to the thread
// pointer, with the appropriate adjustment for the thread pointer offset.
// This generates the pattern
// (add (add_tprel (lui %tprel_hi(sym)) tp %tprel_add(sym)) %tprel_lo(sym))
SDValue AddrHi =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_HI);
SDValue AddrAdd =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_ADD);
SDValue AddrLo =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO);
SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
SDValue MNAdd = SDValue(
DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, MNHi, TPReg, AddrAdd),
0);
return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNAdd, AddrLo), 0);
}
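// Illustrative sequences emitted by getStaticTLSAddr (a0 is an arbitrary
// scratch register; tp is x4):
//   Initial-exec (UseGOT):  auipc  a0, %tls_ie_pcrel_hi(sym)
//                           l[w|d] a0, %pcrel_lo(<auipc label>)(a0)
//                           add    a0, a0, tp
//   Local-exec:             lui    a0, %tprel_hi(sym)
//                           add    a0, a0, tp, %tprel_add(sym)
//                           addi   a0, a0, %tprel_lo(sym)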
SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
SelectionDAG &DAG) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits());
const GlobalValue *GV = N->getGlobal();
// Use a PC-relative addressing mode to access the global dynamic GOT address.
// This generates the pattern (PseudoLA_TLS_GD sym), which expands to
// (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
SDValue Load =
SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Addr), 0);
// Prepare argument list to generate call.
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Load;
Entry.Ty = CallTy;
Args.push_back(Entry);
// Setup call to __tls_get_addr.
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL)
.setChain(DAG.getEntryNode())
.setLibCallee(CallingConv::C, CallTy,
DAG.getExternalSymbol("__tls_get_addr", Ty),
std::move(Args));
return LowerCallTo(CLI).first;
}
SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT Ty = Op.getValueType();
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
int64_t Offset = N->getOffset();
MVT XLenVT = Subtarget.getXLenVT();
// Non-PIC TLS lowering should always use the LocalExec model.
TLSModel::Model Model = isPositionIndependent()
? getTargetMachine().getTLSModel(N->getGlobal())
: TLSModel::LocalExec;
SDValue Addr;
switch (Model) {
case TLSModel::LocalExec:
Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false);
break;
case TLSModel::InitialExec:
Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true);
break;
case TLSModel::LocalDynamic:
case TLSModel::GeneralDynamic:
Addr = getDynamicTLSAddr(N, DAG);
break;
}
// In order to maximise the opportunity for common subexpression elimination,
// emit a separate ADD node for the global address offset instead of folding
// it in the global address node. Later peephole optimisations may choose to
// fold it back in when profitable.
if (Offset != 0)
return DAG.getNode(ISD::ADD, DL, Ty, Addr,
DAG.getConstant(Offset, DL, XLenVT));
return Addr;
}
SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue CondV = Op.getOperand(0);
SDValue TrueV = Op.getOperand(1);
SDValue FalseV = Op.getOperand(2);
SDLoc DL(Op);
MVT XLenVT = Subtarget.getXLenVT();
// If the result type is XLenVT and CondV is the output of a SETCC node
// which also operated on XLenVT inputs, then merge the SETCC node into the
// lowered RISCVISD::SELECT_CC to take advantage of the integer
// compare+branch instructions. i.e.:
// (select (setcc lhs, rhs, cc), truev, falsev)
// -> (riscvisd::select_cc lhs, rhs, cc, truev, falsev)
if (Op.getSimpleValueType() == XLenVT && CondV.getOpcode() == ISD::SETCC &&
CondV.getOperand(0).getSimpleValueType() == XLenVT) {
SDValue LHS = CondV.getOperand(0);
SDValue RHS = CondV.getOperand(1);
auto CC = cast<CondCodeSDNode>(CondV.getOperand(2));
ISD::CondCode CCVal = CC->get();
normaliseSetCC(LHS, RHS, CCVal);
SDValue TargetCC = DAG.getConstant(CCVal, DL, XLenVT);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
}
// Otherwise:
// (select condv, truev, falsev)
// -> (riscvisd::select_cc condv, zero, setne, truev, falsev)
SDValue Zero = DAG.getConstant(0, DL, XLenVT);
SDValue SetNE = DAG.getConstant(ISD::SETNE, DL, XLenVT);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {CondV, Zero, SetNE, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
}
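// For example, lowerSELECT turns (select (setcc a, b, setlt), x, y) with
// XLenVT operands into (select_cc a, b, setlt, x, y), which is later
// expanded to a BLT-based branch sequence instead of first materialising
// the setcc result in a register and branching on that.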
SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>();
SDLoc DL(Op);
SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
getPointerTy(MF.getDataLayout()));
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
MachinePointerInfo(SV));
}
SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setFrameAddressIsTaken(true);
unsigned FrameReg = RI.getFrameRegister(MF);
int XLenInBytes = Subtarget.getXLen() / 8;
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
while (Depth--) {
int Offset = -(XLenInBytes * 2);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
DAG.getIntPtrConstant(Offset, DL));
FrameAddr =
DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
}
return FrameAddr;
}
SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setReturnAddressIsTaken(true);
MVT XLenVT = Subtarget.getXLenVT();
int XLenInBytes = Subtarget.getXLen() / 8;
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
int Off = -XLenInBytes;
SDValue FrameAddr = lowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(Off, DL, VT);
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
MachinePointerInfo());
}
// Return the value of the return address register, marking it an implicit
// live-in.
unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT));
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT);
}
SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Shamt = Op.getOperand(2);
EVT VT = Lo.getValueType();
// if Shamt-XLEN < 0: // Shamt < XLEN
//   Lo = Lo << Shamt
//   Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt))
// else:
//   Lo = 0
//   Hi = Lo << (Shamt-XLEN)
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue One = DAG.getConstant(1, DL, VT);
SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One);
SDValue ShiftRightLo =
DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, XLenMinus1Shamt);
SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt);
SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusXLen);
SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero);
Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
SDValue Parts[2] = {Lo, Hi};
return DAG.getMergeValues(Parts, DL);
}
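// Worked example for lowerShiftLeftParts on RV32 (XLEN == 32): for
// Shamt == 40 the second branch applies, so Lo = 0 and Hi = Lo_in << 8;
// for Shamt == 4 the first branch gives Lo = Lo_in << 4 and
// Hi = (Hi_in << 4) | (Lo_in >>u 28), i.e. the top four bits of the low
// word move into the high word.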
SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
bool IsSRA) const {
SDLoc DL(Op);
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Shamt = Op.getOperand(2);
EVT VT = Lo.getValueType();
// SRA expansion:
//   if Shamt-XLEN < 0: // Shamt < XLEN
//     Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
//     Hi = Hi >>s Shamt
//   else:
//     Lo = Hi >>s (Shamt-XLEN);
//     Hi = Hi >>s (XLEN-1)
//
// SRL expansion:
//   if Shamt-XLEN < 0: // Shamt < XLEN
//     Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
//     Hi = Hi >>u Shamt
//   else:
//     Lo = Hi >>u (Shamt-XLEN);
//     Hi = 0;
unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL;
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue One = DAG.getConstant(1, DL, VT);
SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One);
SDValue ShiftLeftHi =
DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, XLenMinus1Shamt);
SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi);
SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt);
SDValue LoFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusXLen);
SDValue HiFalse =
IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, XLenMinus1) : Zero;
SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse);
Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
SDValue Parts[2] = {Lo, Hi};
return DAG.getMergeValues(Parts, DL);
}
// Returns the opcode of the target-specific SDNode that implements the 32-bit
// form of the given Opcode.
static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
switch (Opcode) {
default:
llvm_unreachable("Unexpected opcode");
case ISD::SHL:
return RISCVISD::SLLW;
case ISD::SRA:
return RISCVISD::SRAW;
case ISD::SRL:
return RISCVISD::SRLW;
case ISD::SDIV:
return RISCVISD::DIVW;
case ISD::UDIV:
return RISCVISD::DIVUW;
case ISD::UREM:
return RISCVISD::REMUW;
}
}
// Converts the given 32-bit operation to a target-specific SelectionDAG node.
// Because i32 isn't a legal type for RV64, these operations would otherwise
// be promoted to i64, making it difficult to select the SLLW/DIVUW/.../*W
// later on because the fact that the operation was originally of type i32 is
// lost.
static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
// ReplaceNodeResults requires we maintain the same type for the return value.
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
}
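// For example, customLegalizeToWOp turns an i32 (srl x, y) on RV64 into
//   (trunc (RISCVISD::SRLW (any_extend x), (any_extend y)))
// so that instruction selection can pick SRLW rather than a full 64-bit
// shift of the promoted values.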
void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDLoc DL(N);
switch (N->getOpcode()) {
default:
llvm_unreachable("Don't know how to custom type legalize this operation!");
case ISD::READCYCLECOUNTER: {
assert(!Subtarget.is64Bit() &&
"READCYCLECOUNTER only has custom type legalization on riscv32");
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RCW =
DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0));
Results.push_back(RCW);
Results.push_back(RCW.getValue(1));
Results.push_back(RCW.getValue(2));
break;
}
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
if (N->getOperand(1).getOpcode() == ISD::Constant)
return;
Results.push_back(customLegalizeToWOp(N, DAG));
break;
case ISD::SDIV:
case ISD::UDIV:
case ISD::UREM:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
Subtarget.hasStdExtM() && "Unexpected custom legalisation");
if (N->getOperand(0).getOpcode() == ISD::Constant ||
N->getOperand(1).getOpcode() == ISD::Constant)
return;
Results.push_back(customLegalizeToWOp(N, DAG));
break;
case ISD::BITCAST: {
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
Subtarget.hasStdExtF() && "Unexpected custom legalisation");
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
if (Op0.getValueType() != MVT::f32)
return;
SDValue FPConv =
DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
break;
}
}
}
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default:
break;
case RISCVISD::SplitF64: {
SDValue Op0 = N->getOperand(0);
// If the input to SplitF64 is just BuildPairF64 then the operation is
// redundant. Instead, use BuildPairF64's operands directly.
if (Op0->getOpcode() == RISCVISD::BuildPairF64)
return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1));
SDLoc DL(N);
// It's cheaper to materialise two 32-bit integers than to load a double
// from the constant pool and transfer it to integer registers through the
// stack.
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op0)) {
APInt V = C->getValueAPF().bitcastToAPInt();
SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32);
SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32);
return DCI.CombineTo(N, Lo, Hi);
}
// This is a target-specific version of a DAGCombine performed in
// DAGCombiner::visitBITCAST. It performs the equivalent of:
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
!Op0.getNode()->hasOneUse())
break;
SDValue NewSplitF64 =
DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32),
Op0.getOperand(0));
SDValue Lo = NewSplitF64.getValue(0);
SDValue Hi = NewSplitF64.getValue(1);
APInt SignBit = APInt::getSignMask(32);
if (Op0.getOpcode() == ISD::FNEG) {
SDValue NewHi = DAG.getNode(ISD::XOR, DL, MVT::i32, Hi,
DAG.getConstant(SignBit, DL, MVT::i32));
return DCI.CombineTo(N, Lo, NewHi);
}
assert(Op0.getOpcode() == ISD::FABS);
SDValue NewHi = DAG.getNode(ISD::AND, DL, MVT::i32, Hi,
DAG.getConstant(~SignBit, DL, MVT::i32));
return DCI.CombineTo(N, Lo, NewHi);
}
case RISCVISD::SLLW:
case RISCVISD::SRAW:
case RISCVISD::SRLW: {
// Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
if ((SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI)) ||
(SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI)))
return SDValue();
break;
}
case RISCVISD::FMV_X_ANYEXTW_RV64: {
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
// If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the
// conversion is unnecessary and can be replaced with an ANY_EXTEND
// of the FMV_W_X_RV64 operand.
if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) {
SDValue AExtOp =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0.getOperand(0));
return DCI.CombineTo(N, AExtOp);
}
// This is a target-specific version of a DAGCombine performed in
// DAGCombiner::visitBITCAST. It performs the equivalent of:
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
!Op0.getNode()->hasOneUse())
break;
SDValue NewFMV = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64,
Op0.getOperand(0));
APInt SignBit = APInt::getSignMask(32).sext(64);
if (Op0.getOpcode() == ISD::FNEG) {
return DCI.CombineTo(N,
DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV,
DAG.getConstant(SignBit, DL, MVT::i64)));
}
assert(Op0.getOpcode() == ISD::FABS);
return DCI.CombineTo(N,
DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
DAG.getConstant(~SignBit, DL, MVT::i64)));
}
}
return SDValue();
}
bool RISCVTargetLowering::isDesirableToCommuteWithShift(
const SDNode *N, CombineLevel Level) const {
// The following folds are only desirable if `(OP _, c1 << c2)` can be
// materialised in fewer instructions than `(OP _, c1)`:
//
// (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
// (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
SDValue N0 = N->getOperand(0);
EVT Ty = N0.getValueType();
if (Ty.isScalarInteger() &&
(N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) {
auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (C1 && C2) {
APInt C1Int = C1->getAPIntValue();
APInt ShiftedC1Int = C1Int << C2->getAPIntValue();
// We can materialise `c1 << c2` into an add immediate, so it's "free",
// and the combine should happen, to potentially allow further combines
// later.
if (ShiftedC1Int.getMinSignedBits() <= 64 &&
isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
return true;
// We can materialise `c1` in an add immediate, so it's "free", and the
// combine should be prevented.
if (C1Int.getMinSignedBits() <= 64 &&
isLegalAddImmediate(C1Int.getSExtValue()))
return false;
// Neither constant will fit into an immediate, so find materialisation
// costs.
int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(),
Subtarget.is64Bit());
int ShiftedC1Cost = RISCVMatInt::getIntMatCost(
ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit());
// Materialising `c1` is cheaper than materialising `c1 << c2`, so the
// combine should be prevented.
if (C1Cost < ShiftedC1Cost)
return false;
}
}
return true;
}
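// For example, with (shl (add x, 1), 2) the shifted constant 4 still fits
// in an ADDI immediate, so the fold is allowed; with (shl (add x, 2047), 4)
// the shifted constant 32752 no longer fits but 2047 does, so the fold is
// blocked to keep the cheap addi. When neither constant fits in an
// immediate, the RISCVMatInt materialisation costs decide.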
unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
case RISCVISD::SLLW:
case RISCVISD::SRAW:
case RISCVISD::SRLW:
case RISCVISD::DIVW:
case RISCVISD::DIVUW:
case RISCVISD::REMUW:
// TODO: As the result is sign-extended, this is conservatively correct. A
// more precise answer could be calculated for SRAW depending on known
// bits in the shift amount.
return 33;
}
return 1;
}
static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction");
// To read the 64-bit cycle CSR on a 32-bit target, we read the two halves.
// Should the count have wrapped while it was being read, we need to try
// again.
// ...
// read:
// rdcycleh x3 # load high word of cycle
// rdcycle x2 # load low word of cycle
// rdcycleh x4 # load high word of cycle
// bne x3, x4, read # check if high word reads match, otherwise try again
// ...
MachineFunction &MF = *BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MF.insert(It, LoopMBB);
MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MF.insert(It, DoneMBB);
// Transfer the remainder of BB and its successor edges to DoneMBB.
DoneMBB->splice(DoneMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
DoneMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(LoopMBB);
MachineRegisterInfo &RegInfo = MF.getRegInfo();
unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
unsigned LoReg = MI.getOperand(0).getReg();
unsigned HiReg = MI.getOperand(1).getReg();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg)
.addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
.addReg(RISCV::X0);
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg)
.addImm(RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding)
.addReg(RISCV::X0);
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg)
.addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
.addReg(RISCV::X0);
BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
.addReg(HiReg)
.addReg(ReadAgainReg)
.addMBB(LoopMBB);
LoopMBB->addSuccessor(LoopMBB);
LoopMBB->addSuccessor(DoneMBB);
MI.eraseFromParent();
return DoneMBB;
}
static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction");
MachineFunction &MF = *BB->getParent();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
unsigned LoReg = MI.getOperand(0).getReg();
unsigned HiReg = MI.getOperand(1).getReg();
unsigned SrcReg = MI.getOperand(2).getReg();
const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
RI);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
MachineMemOperand::MOLoad, 8, 8);
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO);
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg)
.addFrameIndex(FI)
.addImm(4)
.addMemOperand(MMO);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::BuildPairF64Pseudo &&
"Unexpected instruction");
MachineFunction &MF = *BB->getParent();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
unsigned DstReg = MI.getOperand(0).getReg();
unsigned LoReg = MI.getOperand(1).getReg();
unsigned HiReg = MI.getOperand(2).getReg();
const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
MachineMemOperand::MOStore, 8, 8);
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO);
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()))
.addFrameIndex(FI)
.addImm(4)
.addMemOperand(MMO);
TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
static bool isSelectPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
case RISCV::Select_GPR_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
return true;
}
}
static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
// To "insert" Select_* instructions, we actually have to insert the triangle
// control-flow pattern. The incoming instructions know the destination vreg
// to set, the condition code register to branch on, the true/false values to
// select between, and the condcode to use to select the appropriate branch.
//
// We produce the following control flow:
//     HeadMBB
//     |  \
//     |   IfFalseMBB
//     |  /
//    TailMBB
//
// When we find a sequence of selects we attempt to optimize their emission
// by sharing the control flow. Currently we only handle cases where we have
// multiple selects with the exact same condition (same LHS, RHS and CC).
// The selects may be interleaved with other instructions if the other
// instructions meet some requirements we deem safe:
// - They are debug instructions. Otherwise,
// - They do not have side-effects, do not access memory and their inputs do
// not depend on the results of the select pseudo-instructions.
// The TrueV/FalseV operands of the selects cannot depend on the result of
// previous selects in the sequence.
// These conditions could be further relaxed. See the X86 target for a
// related approach and more information.
unsigned LHS = MI.getOperand(1).getReg();
unsigned RHS = MI.getOperand(2).getReg();
auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
SmallVector<MachineInstr *, 4> SelectDebugValues;
SmallSet<unsigned, 4> SelectDests;
SelectDests.insert(MI.getOperand(0).getReg());
MachineInstr *LastSelectPseudo = &MI;
for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
SequenceMBBI != E; ++SequenceMBBI) {
if (SequenceMBBI->isDebugInstr())
continue;
else if (isSelectPseudo(*SequenceMBBI)) {
if (SequenceMBBI->getOperand(1).getReg() != LHS ||
SequenceMBBI->getOperand(2).getReg() != RHS ||
SequenceMBBI->getOperand(3).getImm() != CC ||
SelectDests.count(SequenceMBBI->getOperand(4).getReg()) ||
SelectDests.count(SequenceMBBI->getOperand(5).getReg()))
break;
LastSelectPseudo = &*SequenceMBBI;
SequenceMBBI->collectDebugValues(SelectDebugValues);
SelectDests.insert(SequenceMBBI->getOperand(0).getReg());
} else {
if (SequenceMBBI->hasUnmodeledSideEffects() ||
SequenceMBBI->mayLoadOrStore())
break;
if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
}))
break;
}
}
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator I = ++BB->getIterator();
MachineBasicBlock *HeadMBB = BB;
MachineFunction *F = BB->getParent();
MachineBasicBlock *TailMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *IfFalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(I, IfFalseMBB);
F->insert(I, TailMBB);
// Transfer debug instructions associated with the selects to TailMBB.
for (MachineInstr *DebugInstr : SelectDebugValues) {
TailMBB->push_back(DebugInstr->removeFromParent());
}
// Move all instructions after the sequence to TailMBB.
TailMBB->splice(TailMBB->end(), HeadMBB,
std::next(LastSelectPseudo->getIterator()), HeadMBB->end());
// Update machine-CFG edges by transferring all successors of the current
// block to the new block which will contain the Phi nodes for the selects.
TailMBB->transferSuccessorsAndUpdatePHIs(HeadMBB);
// Set the successors for HeadMBB.
HeadMBB->addSuccessor(IfFalseMBB);
HeadMBB->addSuccessor(TailMBB);
// Insert appropriate branch.
unsigned Opcode = getBranchOpcodeForIntCondCode(CC);
BuildMI(HeadMBB, DL, TII.get(Opcode))
.addReg(LHS)
.addReg(RHS)
.addMBB(TailMBB);
// IfFalseMBB just falls through to TailMBB.
IfFalseMBB->addSuccessor(TailMBB);
// Create PHIs for all of the select pseudo-instructions.
auto SelectMBBI = MI.getIterator();
auto SelectEnd = std::next(LastSelectPseudo->getIterator());
auto InsertionPoint = TailMBB->begin();
while (SelectMBBI != SelectEnd) {
auto Next = std::next(SelectMBBI);
if (isSelectPseudo(*SelectMBBI)) {
// %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ]
BuildMI(*TailMBB, InsertionPoint, SelectMBBI->getDebugLoc(),
TII.get(RISCV::PHI), SelectMBBI->getOperand(0).getReg())
.addReg(SelectMBBI->getOperand(4).getReg())
.addMBB(HeadMBB)
.addReg(SelectMBBI->getOperand(5).getReg())
.addMBB(IfFalseMBB);
SelectMBBI->eraseFromParent();
}
SelectMBBI = Next;
}
F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
return TailMBB;
}
MachineBasicBlock *
RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected instr type to insert");
case RISCV::ReadCycleWide:
assert(!Subtarget.is64Bit() &&
"ReadCycleWrite is only to be used on riscv32");
return emitReadCycleWidePseudo(MI, BB);
case RISCV::Select_GPR_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
return emitSelectPseudo(MI, BB);
case RISCV::BuildPairF64Pseudo:
return emitBuildPairF64Pseudo(MI, BB);
case RISCV::SplitF64Pseudo:
return emitSplitF64Pseudo(MI, BB);
}
}
// Calling Convention Implementation.
// The expectations for frontend ABI lowering vary from target to target.
// Ideally, an LLVM frontend would be able to avoid worrying about many ABI
// details, but this is a longer term goal. For now, we simply try to keep the
// role of the frontend as simple and well-defined as possible. The rules can
// be summarised as:
// * Never split up large scalar arguments. We handle them here.
// * If a hardfloat calling convention is being used, and the struct may be
// passed in a pair of registers (fp+fp, int+fp), and both registers are
// available, then pass as two separate arguments. If either the GPRs or FPRs
// are exhausted, then pass according to the rule below.
// * If a struct could never be passed in registers or directly in a stack
// slot (as it is larger than 2*XLEN and the floating point rules don't
// apply), then pass it using a pointer with the byval attribute.
// * If a struct is less than 2*XLEN, then coerce to either a two-element
// word-sized array or a 2*XLEN scalar (depending on alignment).
// * The frontend can determine whether a struct is returned by reference or
// not based on its size and fields. If it will be returned by reference, the
// frontend must modify the prototype so a pointer with the sret annotation is
// passed as the first argument. This is not necessary for large scalar
// returns.
// * Struct return values and varargs should be coerced to structs containing
// register-size fields in the same situations they would be for fixed
// arguments.
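// Rough illustration of the rules above, assuming a hard-float ABI such as
// ilp32d with enough free argument registers (the struct types are
// illustrative only):
//   struct { double d; float f; }   -> two separate FP arguments
//   struct { double d; int32_t i; } -> one FP argument and one GPR argument
//   struct { char buf[64]; }        -> larger than 2*XLEN, passed byval via
//                                      a pointer
//   a small all-integer struct      -> coerced to a 2*XLEN scalar or a
//                                      two-element XLEN array, depending on
//                                      its alignment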
static const MCPhysReg ArgGPRs[] = {
RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13,
RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17
};
static const MCPhysReg ArgFPR32s[] = {
RISCV::F10_32, RISCV::F11_32, RISCV::F12_32, RISCV::F13_32,
RISCV::F14_32, RISCV::F15_32, RISCV::F16_32, RISCV::F17_32
};
static const MCPhysReg ArgFPR64s[] = {
RISCV::F10_64, RISCV::F11_64, RISCV::F12_64, RISCV::F13_64,
RISCV::F14_64, RISCV::F15_64, RISCV::F16_64, RISCV::F17_64
};
// Pass a 2*XLEN argument that has been split into two XLEN values through
// registers or the stack as necessary.
static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2,
MVT ValVT2, MVT LocVT2,
ISD::ArgFlagsTy ArgFlags2) {
unsigned XLenInBytes = XLen / 8;
if (unsigned Reg = State.AllocateReg(ArgGPRs)) {
// At least one half can be passed via register.
State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg,
VA1.getLocVT(), CCValAssign::Full));
} else {
// Both halves must be passed on the stack, with proper alignment.
unsigned StackAlign = std::max(XLenInBytes, ArgFlags1.getOrigAlign());
State.addLoc(
CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(),
State.AllocateStack(XLenInBytes, StackAlign),
VA1.getLocVT(), CCValAssign::Full));
State.addLoc(CCValAssign::getMem(
ValNo2, ValVT2, State.AllocateStack(XLenInBytes, XLenInBytes), LocVT2,
CCValAssign::Full));
return false;
}
if (unsigned Reg = State.AllocateReg(ArgGPRs)) {
// The second half can also be passed via register.
State.addLoc(
CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full));
} else {
// The second half is passed via the stack, without additional alignment.
State.addLoc(CCValAssign::getMem(
ValNo2, ValVT2, State.AllocateStack(XLenInBytes, XLenInBytes), LocVT2,
CCValAssign::Full));
}
return false;
}
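// For example, CC_RISCVAssign2XLen may place an i64 argument on RV32 that
// was split into two i32 halves entirely in GPRs (e.g. a0/a1), split it
// between the last GPR (a7) and a 4-byte stack slot, or place it entirely
// on the stack once the argument GPRs are exhausted, with the first half
// getting at least the original type's alignment (register names are
// illustrative).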
// Implements the RISC-V calling convention. Returns true upon failure.
static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
bool IsRet, Type *OrigTy) {
unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
assert(XLen == 32 || XLen == 64);
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
// Any return value split into more than two values can't be returned
// directly.
if (IsRet && ValNo > 1)
return true;
// UseGPRForF32 is true if targeting one of the soft-float ABIs, if passing a
// variadic argument, or if no F32 argument registers are available.
bool UseGPRForF32 = true;
// UseGPRForF64 is true if targeting soft-float ABIs or an FLEN=32 ABI, if
// passing a
// variadic argument, or if no F64 argument registers are available.
bool UseGPRForF64 = true;
switch (ABI) {
default:
llvm_unreachable("Unexpected ABI");
case RISCVABI::ABI_ILP32:
case RISCVABI::ABI_LP64:
break;
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_LP64F:
UseGPRForF32 = !IsFixed;
break;
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64D:
UseGPRForF32 = !IsFixed;
UseGPRForF64 = !IsFixed;
break;
}
if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s))
UseGPRForF32 = true;
if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s))
UseGPRForF64 = true;
// From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local
// variables rather than directly checking against the target ABI.
if (UseGPRForF32 && ValVT == MVT::f32) {
LocVT = XLenVT;
LocInfo = CCValAssign::BCvt;
} else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) {
LocVT = MVT::i64;
LocInfo = CCValAssign::BCvt;
}
// If this is a variadic argument, the RISC-V calling convention requires
// that it is assigned an 'even' or 'aligned' register if it has 8-byte
// alignment (RV32) or 16-byte alignment (RV64). An aligned register should
// be used regardless of whether the original argument was split during
// legalisation or not. The argument will not be passed by registers if the
// original type is larger than 2*XLEN, so the register alignment rule does
// not apply.
unsigned TwoXLenInBytes = (2 * XLen) / 8;
if (!IsFixed && ArgFlags.getOrigAlign() == TwoXLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
// Skip 'odd' register if necessary.
if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1)
State.AllocateReg(ArgGPRs);
}
SmallVectorImpl<CCValAssign> &PendingLocs = State.getPendingLocs();
SmallVectorImpl<ISD::ArgFlagsTy> &PendingArgFlags =
State.getPendingArgFlags();
assert(PendingLocs.size() == PendingArgFlags.size() &&
"PendingLocs and PendingArgFlags out of sync");
// Handle passing f64 on RV32D with a soft float ABI or when floating point
// registers are exhausted.
if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) {
assert(!ArgFlags.isSplit() && PendingLocs.empty() &&
"Can't lower f64 if it is split");
// Depending on available argument GPRs, f64 may be passed in a pair of
// GPRs, split between a GPR and the stack, or passed completely on the
// stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these
// cases.
unsigned Reg = State.AllocateReg(ArgGPRs);
LocVT = MVT::i32;
if (!Reg) {
unsigned StackOffset = State.AllocateStack(8, 8);
State.addLoc(
CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
}
if (!State.AllocateReg(ArgGPRs))
State.AllocateStack(4, 4);
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
// Split arguments might be passed indirectly, so keep track of the pending
// values.
if (ArgFlags.isSplit() || !PendingLocs.empty()) {
LocVT = XLenVT;
LocInfo = CCValAssign::Indirect;
PendingLocs.push_back(
CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
PendingArgFlags.push_back(ArgFlags);
if (!ArgFlags.isSplitEnd()) {
return false;
}
}
// If the split argument only had two elements, it should be passed directly
// in registers or on the stack.
if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) {
assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()");
// Apply the normal calling convention rules to the first half of the
// split argument.
CCValAssign VA = PendingLocs[0];
ISD::ArgFlagsTy AF = PendingArgFlags[0];
PendingLocs.clear();
PendingArgFlags.clear();
return CC_RISCVAssign2XLen(XLen, State, VA, AF, ValNo, ValVT, LocVT,
ArgFlags);
}
// Allocate to a register if possible, or else a stack slot.
unsigned Reg;
if (ValVT == MVT::f32 && !UseGPRForF32)
Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s);
else if (ValVT == MVT::f64 && !UseGPRForF64)
Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s);
else
Reg = State.AllocateReg(ArgGPRs);
unsigned StackOffset = Reg ? 0 : State.AllocateStack(XLen / 8, XLen / 8);
// If we reach this point and PendingLocs is non-empty, we must be at the
// end of a split argument that must be passed indirectly.
if (!PendingLocs.empty()) {
assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()");
assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()");
for (auto &It : PendingLocs) {
if (Reg)
It.convertToReg(Reg);
else
It.convertToMem(StackOffset);
State.addLoc(It);
}
PendingLocs.clear();
PendingArgFlags.clear();
return false;
}
assert((!UseGPRForF32 || !UseGPRForF64 || LocVT == XLenVT) &&
"Expected an XLenVT at this stage");
if (Reg) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
// When an f32 or f64 is passed on the stack, no bit-conversion is needed.
if (ValVT == MVT::f32 || ValVT == MVT::f64) {
LocVT = ValVT;
LocInfo = CCValAssign::Full;
}
State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
}
void RISCVTargetLowering::analyzeInputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
unsigned NumArgs = Ins.size();
FunctionType *FType = MF.getFunction().getFunctionType();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
Type *ArgTy = nullptr;
if (IsRet)
ArgTy = FType->getReturnType();
else if (Ins[i].isOrigArg())
ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << '\n');
llvm_unreachable(nullptr);
}
}
}
void RISCVTargetLowering::analyzeOutputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
CallLoweringInfo *CLI) const {
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; i++) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr;
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n");
llvm_unreachable(nullptr);
}
}
}
// Convert Val to a ValVT. Should not be called for CCValAssign::Indirect
// values.
static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
const CCValAssign &VA, const SDLoc &DL) {
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
break;
}
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
}
return Val;
}
// The caller is responsible for loading the full value if the argument is
// passed with CCValAssign::Indirect.
static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
EVT LocVT = VA.getLocVT();
SDValue Val;
const TargetRegisterClass *RC;
switch (LocVT.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("Unexpected register type");
case MVT::i32:
case MVT::i64:
RC = &RISCV::GPRRegClass;
break;
case MVT::f32:
RC = &RISCV::FPR32RegClass;
break;
case MVT::f64:
RC = &RISCV::FPR64RegClass;
break;
}
unsigned VReg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
if (VA.getLocInfo() == CCValAssign::Indirect)
return Val;
return convertLocVTToValVT(DAG, Val, VA, DL);
}
static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
const CCValAssign &VA, const SDLoc &DL) {
EVT LocVT = VA.getLocVT();
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val);
break;
}
Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
break;
}
return Val;
}
// The caller is responsible for loading the full value if the argument is
// passed with CCValAssign::Indirect.
static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT LocVT = VA.getLocVT();
EVT ValVT = VA.getValVT();
EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0));
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val;
ISD::LoadExtType ExtType;
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
case CCValAssign::Indirect:
case CCValAssign::BCvt:
ExtType = ISD::NON_EXTLOAD;
break;
}
Val = DAG.getExtLoad(
ExtType, DL, LocVT, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT);
return Val;
}
static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 &&
"Unexpected VA");
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
if (VA.isMemLoc()) {
// f64 is passed on the stack.
int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
return DAG.getLoad(MVT::f64, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
}
assert(VA.isRegLoc() && "Expected register VA assignment");
unsigned LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg(), LoVReg);
SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);
SDValue Hi;
if (VA.getLocReg() == RISCV::X17) {
// Second half of f64 is passed on the stack.
int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
} else {
// Second half of f64 is passed in another GPR.
unsigned HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg);
Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
}
return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
}
// Transform physical registers into virtual registers.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
switch (CallConv) {
default:
report_fatal_error("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
break;
}
MachineFunction &MF = DAG.getMachineFunction();
const Function &Func = MF.getFunction();
if (Func.hasFnAttribute("interrupt")) {
if (!Func.arg_empty())
report_fatal_error(
"Functions with the interrupt attribute cannot have arguments!");
StringRef Kind =
MF.getFunction().getFnAttribute("interrupt").getValueAsString();
if (!(Kind == "user" || Kind == "supervisor" || Kind == "machine"))
report_fatal_error(
"Function interrupt attribute argument not supported!");
}
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
unsigned XLenInBytes = Subtarget.getXLen() / 8;
// Used with varargs to accumulate store chains.
std::vector<SDValue> OutChains;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue;
// Passing f64 on RV32D with a soft float ABI must be handled as a special
// case.
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64)
ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL);
else if (VA.isRegLoc())
ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL);
else
ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
if (VA.getLocInfo() == CCValAssign::Indirect) {
// If the original argument was split and passed by reference (e.g. i128
// on RV32), we need to load all parts of it here (using the same
// address).
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
MachinePointerInfo()));
unsigned ArgIndex = Ins[i].OrigArgIndex;
assert(Ins[i].PartOffset == 0);
while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) {
CCValAssign &PartVA = ArgLocs[i + 1];
unsigned PartOffset = Ins[i + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
DAG.getIntPtrConstant(PartOffset, DL));
InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
MachinePointerInfo()));
++i;
}
continue;
}
InVals.push_back(ArgValue);
}
if (IsVarArg) {
ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(ArgGPRs);
unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
const TargetRegisterClass *RC = &RISCV::GPRRegClass;
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
// Offset of the first variable argument from stack pointer, and size of
// the vararg save area. For now, the varargs save area is either zero or
// large enough to hold a0-a7.
int VaArgOffset, VarArgsSaveSize;
// If all registers are allocated, then all varargs must be passed on the
// stack and we don't need to save any argregs.
if (ArgRegs.size() == Idx) {
VaArgOffset = CCInfo.getNextStackOffset();
VarArgsSaveSize = 0;
} else {
VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx);
VaArgOffset = -VarArgsSaveSize;
}
// Record the frame index of the first variable argument
// which is needed by VASTART.
int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
RVFI->setVarArgsFrameIndex(FI);
// If saving an odd number of registers then create an extra stack slot to
// ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures
// offsets to even-numbered registers remain 2*XLEN-aligned.
if (Idx % 2) {
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes,
true);
VarArgsSaveSize += XLenInBytes;
}
// Copy the integer registers that may have been used for passing varargs
// to the vararg save area.
for (unsigned I = Idx; I < ArgRegs.size();
++I, VaArgOffset += XLenInBytes) {
const unsigned Reg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(ArgRegs[I], Reg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
MachinePointerInfo::getFixedStack(MF, FI));
cast<StoreSDNode>(Store.getNode())
->getMemOperand()
->setValue((Value *)nullptr);
OutChains.push_back(Store);
}
RVFI->setVarArgsSaveSize(VarArgsSaveSize);
}
// All stores are grouped in one node to allow the matching between
// the size of Ins and InVals. This only happens for vararg functions.
if (!OutChains.empty()) {
OutChains.push_back(Chain);
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
}
return Chain;
}
/// isEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
/// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
bool RISCVTargetLowering::isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const {
auto &Callee = CLI.Callee;
auto CalleeCC = CLI.CallConv;
auto IsVarArg = CLI.IsVarArg;
auto &Outs = CLI.Outs;
auto &Caller = MF.getFunction();
auto CallerCC = Caller.getCallingConv();
// Do not tail call opt functions with "disable-tail-calls" attribute.
if (Caller.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
return false;
// Exception-handling functions need a special set of instructions to
// indicate a return to the hardware. Tail-calling another function would
// probably break this.
// TODO: The "interrupt" attribute isn't currently defined by RISC-V. This
// should be expanded as new function attributes are introduced.
if (Caller.hasFnAttribute("interrupt"))
return false;
// Do not tail call opt functions with varargs.
if (IsVarArg)
return false;
// Do not tail call opt if the stack is used to pass parameters.
if (CCInfo.getNextStackOffset() != 0)
return false;
// Do not tail call opt if any parameters need to be passed indirectly.
// Since long doubles (fp128) and i128 are larger than 2*XLEN, they are
// passed indirectly. So the address of the value will be passed in a
// register, or if no register is available, the address is put on the stack.
// Passing indirectly often requires allocating stack space to hold the value
// itself, so the CCInfo.getNextStackOffset() != 0 check alone is not enough;
// we also need to check whether any CCValAssign in ArgLocs is marked
// CCValAssign::Indirect.
for (auto &VA : ArgLocs)
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
// Do not tail call opt if either caller or callee uses struct return
// semantics.
auto IsCallerStructRet = Caller.hasStructRetAttr();
auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
if (IsCallerStructRet || IsCalleeStructRet)
return false;
// Externally-defined functions with weak linkage should not be
// tail-called. The behaviour of branch instructions in this situation (as
// used for tail calls) is implementation-defined, so we cannot rely on the
// linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
if (GV->hasExternalWeakLinkage())
return false;
}
// The callee has to preserve all registers the caller needs to preserve.
const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (CalleeCC != CallerCC) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible
// but less efficient and uglier in LowerCall.
for (auto &Arg : Outs)
if (Arg.Flags.isByVal())
return false;
return true;
}
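// For example, a simple forwarding call such as `return g(x);` where caller
// and callee share a calling convention, no argument is variadic, byval or
// indirect, and nothing is passed on the stack, is emitted as a tail call;
// any of those conditions failing falls back to a normal call sequence.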
// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
// and output parameter nodes.
SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
MachineFunction &MF = DAG.getMachineFunction();
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
// Check if it's really possible to do a tail call.
if (IsTailCall)
IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs);
if (IsTailCall)
++NumTailCalls;
else if (CLI.CS && CLI.CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
// Create local copies for byval args
SmallVector<SDValue, 8> ByValArgs;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (!Flags.isByVal())
continue;
SDValue Arg = OutVals[i];
unsigned Size = Flags.getByValSize();
unsigned Align = Flags.getByValAlign();
int FI = MF.getFrameInfo().CreateStackObject(Size, Align, /*isSS=*/false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT);
Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
/*IsVolatile=*/false,
/*AlwaysInline=*/false,
IsTailCall, MachinePointerInfo(),
MachinePointerInfo());
ByValArgs.push_back(FIPtr);
}
if (!IsTailCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Handle passing f64 on RV32D with a soft float ABI as a special case.
bool IsF64OnRV32DSoftABI =
VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
if (IsF64OnRV32DSoftABI && VA.isRegLoc()) {
SDValue SplitF64 = DAG.getNode(
RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), ArgValue);
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
unsigned RegLo = VA.getLocReg();
RegsToPass.push_back(std::make_pair(RegLo, Lo));
if (RegLo == RISCV::X17) {
// Second half of f64 is passed on the stack.
// Work out the address of the stack slot.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
// Emit the store.
MemOpChains.push_back(
DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
} else {
// Second half of f64 is passed in another GPR.
unsigned RegHigh = RegLo + 1;
RegsToPass.push_back(std::make_pair(RegHigh, Hi));
}
continue;
}
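// For example, with the ilp32 soft-float ABI a split f64 whose low half is
// assigned to a7 (X17) has run out of GPRs, so its high half is stored to the
// first outgoing stack slot above; otherwise the high half simply travels in
// the next GPR.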
// IsF64OnRV32DSoftABI && VA.isMemLoc() is handled below in the same way
// as any other MemLoc.
// Promote the value if needed.
// For now, only handle fully promoted and indirect arguments.
if (VA.getLocInfo() == CCValAssign::Indirect) {
// Store the argument in a stack slot and pass its address.
SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT);
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, SpillSlot,
MachinePointerInfo::getFixedStack(MF, FI)));
// If the original argument was split (e.g. i128), we need
// to store all parts of it here (and pass just one address).
unsigned ArgIndex = Outs[i].OrigArgIndex;
assert(Outs[i].PartOffset == 0);
while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) {
SDValue PartValue = OutVals[i + 1];
unsigned PartOffset = Outs[i + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
DAG.getIntPtrConstant(PartOffset, DL));
MemOpChains.push_back(
DAG.getStore(Chain, DL, PartValue, Address,
MachinePointerInfo::getFixedStack(MF, FI)));
++i;
}
ArgValue = SpillSlot;
} else {
ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
}
// Use local copy if it is a byval arg.
if (Flags.isByVal())
ArgValue = ByValArgs[j++];
if (VA.isRegLoc()) {
// Queue up the argument copies and emit them at the end.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
assert(!IsTailCall && "Tail call not allowed if stack is used "
"for passing parameters");
// Work out the address of the stack slot.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
SDValue Address =
DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
// Emit the store.
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
}
}
// Join the stores, which are independent of one another.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
SDValue Glue;
// Build a sequence of copy-to-reg nodes, chained and glued together.
for (auto &Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
Glue = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node, turn it into a
// TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
// split it and then direct call can be matched by PseudoCALL.
if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = S->getGlobal();
unsigned OpFlags = RISCVII::MO_CALL;
if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))
OpFlags = RISCVII::MO_PLT;
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned OpFlags = RISCVII::MO_CALL;
if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(),
nullptr))
OpFlags = RISCVII::MO_PLT;
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
}
// The first call operand is the chain and the second is the target address.
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
// Add argument registers to the end of the list so that they are
// known live into the call.
for (auto &Reg : RegsToPass)
Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
if (!IsTailCall) {
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
}
// Glue the call to the argument copies, if any.
if (Glue.getNode())
Ops.push_back(Glue);
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
return DAG.getNode(RISCVISD::TAIL, DL, NodeTys, Ops);
}
Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
Glue = Chain.getValue(1);
// Mark the end of the call, which is glued to the call itself.
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getConstant(NumBytes, DL, PtrVT, true),
DAG.getConstant(0, DL, PtrVT, true),
Glue, DL);
Glue = Chain.getValue(1);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true);
// Copy all of the result registers out of their specified physreg.
for (auto &VA : RVLocs) {
// Copy the value out
SDValue RetValue =
DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
// Glue the RetValue to the end of the call sequence
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
assert(VA.getLocReg() == ArgGPRs[0] && "Unexpected reg assignment");
SDValue RetValue2 =
DAG.getCopyFromReg(Chain, DL, ArgGPRs[1], MVT::i32, Glue);
Chain = RetValue2.getValue(1);
Glue = RetValue2.getValue(2);
RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue,
RetValue2);
}
RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL);
InVals.push_back(RetValue);
}
return Chain;
}
bool RISCVTargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full,
ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr))
return false;
}
return true;
}
SDValue
RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
// Stores the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
// Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true,
nullptr);
SDValue Glue;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
for (unsigned i = 0, e = RVLocs.size(); i < e; ++i) {
SDValue Val = OutVals[i];
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
// Handle returning f64 on RV32D with a soft float ABI.
assert(VA.isRegLoc() && "Expected return via registers");
SDValue SplitF64 = DAG.getNode(RISCVISD::SplitF64, DL,
DAG.getVTList(MVT::i32, MVT::i32), Val);
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
unsigned RegLo = VA.getLocReg();
unsigned RegHi = RegLo + 1;
Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(RegLo, MVT::i32));
Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(RegHi, MVT::i32));
} else {
// Handle a 'normal' return.
Val = convertValVTToLocVT(DAG, Val, VA, DL);
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
// Guarantee that all emitted copies are stuck together.
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
}
RetOps[0] = Chain; // Update chain.
// Add the glue node if we have it.
if (Glue.getNode()) {
RetOps.push_back(Glue);
}
// Interrupt service routines use different return instructions.
const Function &Func = DAG.getMachineFunction().getFunction();
if (Func.hasFnAttribute("interrupt")) {
if (!Func.getReturnType()->isVoidTy())
report_fatal_error(
"Functions with the interrupt attribute must have void return type!");
MachineFunction &MF = DAG.getMachineFunction();
StringRef Kind =
MF.getFunction().getFnAttribute("interrupt").getValueAsString();
unsigned RetOpc;
if (Kind == "user")
RetOpc = RISCVISD::URET_FLAG;
else if (Kind == "supervisor")
RetOpc = RISCVISD::SRET_FLAG;
else
RetOpc = RISCVISD::MRET_FLAG;
return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
}
return DAG.getNode(RISCVISD::RET_FLAG, DL, MVT::Other, RetOps);
}
const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((RISCVISD::NodeType)Opcode) {
case RISCVISD::FIRST_NUMBER:
break;
case RISCVISD::RET_FLAG:
return "RISCVISD::RET_FLAG";
case RISCVISD::URET_FLAG:
return "RISCVISD::URET_FLAG";
case RISCVISD::SRET_FLAG:
return "RISCVISD::SRET_FLAG";
case RISCVISD::MRET_FLAG:
return "RISCVISD::MRET_FLAG";
case RISCVISD::CALL:
return "RISCVISD::CALL";
case RISCVISD::SELECT_CC:
return "RISCVISD::SELECT_CC";
case RISCVISD::BuildPairF64:
return "RISCVISD::BuildPairF64";
case RISCVISD::SplitF64:
return "RISCVISD::SplitF64";
case RISCVISD::TAIL:
return "RISCVISD::TAIL";
case RISCVISD::SLLW:
return "RISCVISD::SLLW";
case RISCVISD::SRAW:
return "RISCVISD::SRAW";
case RISCVISD::SRLW:
return "RISCVISD::SRLW";
case RISCVISD::DIVW:
return "RISCVISD::DIVW";
case RISCVISD::DIVUW:
return "RISCVISD::DIVUW";
case RISCVISD::REMUW:
return "RISCVISD::REMUW";
case RISCVISD::FMV_W_X_RV64:
return "RISCVISD::FMV_W_X_RV64";
case RISCVISD::FMV_X_ANYEXTW_RV64:
return "RISCVISD::FMV_X_ANYEXTW_RV64";
case RISCVISD::READ_CYCLE_WIDE:
return "RISCVISD::READ_CYCLE_WIDE";
}
return nullptr;
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
RISCVTargetLowering::ConstraintType
RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
break;
case 'f':
return C_RegisterClass;
case 'I':
case 'J':
case 'K':
return C_Immediate;
+ case 'A':
+ return C_Memory;
}
}
return TargetLowering::getConstraintType(Constraint);
}
std::pair<unsigned, const TargetRegisterClass *>
RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to a
// RISCV register class.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
return std::make_pair(0U, &RISCV::GPRRegClass);
case 'f':
if (Subtarget.hasStdExtF() && VT == MVT::f32)
return std::make_pair(0U, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD() && VT == MVT::f64)
return std::make_pair(0U, &RISCV::FPR64RegClass);
break;
default:
break;
}
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+unsigned
+RISCVTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const {
+ // Currently only support length 1 constraints.
+ if (ConstraintCode.size() == 1) {
+ switch (ConstraintCode[0]) {
+ case 'A':
+ return InlineAsm::Constraint_A;
+ default:
+ break;
+ }
+ }
+
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
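// The 'A' constraint handled above denotes a memory operand whose address is
// held in a general-purpose register. An illustrative use from C:
//
//   int v, *p = ...;
//   __asm__ volatile ("lw %0, %1" : "=r"(v) : "A"(*p));
//
// where the address of *p is materialised in a GPR and the operand is printed
// as a register-based memory reference.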
void RISCVTargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
// Currently only support length 1 constraints.
if (Constraint.length() == 1) {
switch (Constraint[0]) {
case 'I':
// Validate & create a 12-bit signed immediate operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
uint64_t CVal = C->getSExtValue();
if (isInt<12>(CVal))
Ops.push_back(
DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
}
return;
case 'J':
// Validate & create an integer zero operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op))
if (C->getZExtValue() == 0)
Ops.push_back(
DAG.getTargetConstant(0, SDLoc(Op), Subtarget.getXLenVT()));
return;
case 'K':
// Validate & create a 5-bit unsigned immediate operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
uint64_t CVal = C->getZExtValue();
if (isUInt<5>(CVal))
Ops.push_back(
DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
}
return;
default:
break;
}
}
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
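// Illustrative use of the 'I' (12-bit signed immediate) constraint from C:
//
//   __asm__ ("addi %0, %1, %2" : "=r"(y) : "r"(x), "I"(42));
//
// A constant outside the range (e.g. 4096) produces no operand here and is
// reported as an invalid operand for the constraint; 'J' similarly accepts
// only the constant zero and 'K' a 5-bit unsigned immediate.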
Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
return Builder.CreateFence(Ord);
if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
return Builder.CreateFence(AtomicOrdering::Release);
return nullptr;
}
Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (isa<LoadInst>(Inst) && isAcquireOrStronger(Ord))
return Builder.CreateFence(AtomicOrdering::Acquire);
return nullptr;
}
TargetLowering::AtomicExpansionKind
RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
// atomicrmw {fadd,fsub} must be expanded to use compare-exchange, as floating
// point operations can't be used in an lr/sc sequence without breaking the
// forward-progress guarantee.
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
if (Size == 8 || Size == 16)
return AtomicExpansionKind::MaskedIntrinsic;
return AtomicExpansionKind::None;
}
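// For example, an `atomicrmw add` on an i8 is rewritten by AtomicExpandPass
// into a call to the masked intrinsic emitted below, which runs an LR/SC loop
// on the naturally aligned containing word and updates only the bits selected
// by the mask.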
static Intrinsic::ID
getIntrinsicForMaskedAtomicRMWBinOp(unsigned XLen, AtomicRMWInst::BinOp BinOp) {
if (XLen == 32) {
switch (BinOp) {
default:
llvm_unreachable("Unexpected AtomicRMW BinOp");
case AtomicRMWInst::Xchg:
return Intrinsic::riscv_masked_atomicrmw_xchg_i32;
case AtomicRMWInst::Add:
return Intrinsic::riscv_masked_atomicrmw_add_i32;
case AtomicRMWInst::Sub:
return Intrinsic::riscv_masked_atomicrmw_sub_i32;
case AtomicRMWInst::Nand:
return Intrinsic::riscv_masked_atomicrmw_nand_i32;
case AtomicRMWInst::Max:
return Intrinsic::riscv_masked_atomicrmw_max_i32;
case AtomicRMWInst::Min:
return Intrinsic::riscv_masked_atomicrmw_min_i32;
case AtomicRMWInst::UMax:
return Intrinsic::riscv_masked_atomicrmw_umax_i32;
case AtomicRMWInst::UMin:
return Intrinsic::riscv_masked_atomicrmw_umin_i32;
}
}
if (XLen == 64) {
switch (BinOp) {
default:
llvm_unreachable("Unexpected AtomicRMW BinOp");
case AtomicRMWInst::Xchg:
return Intrinsic::riscv_masked_atomicrmw_xchg_i64;
case AtomicRMWInst::Add:
return Intrinsic::riscv_masked_atomicrmw_add_i64;
case AtomicRMWInst::Sub:
return Intrinsic::riscv_masked_atomicrmw_sub_i64;
case AtomicRMWInst::Nand:
return Intrinsic::riscv_masked_atomicrmw_nand_i64;
case AtomicRMWInst::Max:
return Intrinsic::riscv_masked_atomicrmw_max_i64;
case AtomicRMWInst::Min:
return Intrinsic::riscv_masked_atomicrmw_min_i64;
case AtomicRMWInst::UMax:
return Intrinsic::riscv_masked_atomicrmw_umax_i64;
case AtomicRMWInst::UMin:
return Intrinsic::riscv_masked_atomicrmw_umin_i64;
}
}
llvm_unreachable("Unexpected XLen\n");
}
Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
Value *Ordering =
Builder.getIntN(XLen, static_cast<uint64_t>(AI->getOrdering()));
Type *Tys[] = {AlignedAddr->getType()};
Function *LrwOpScwLoop = Intrinsic::getDeclaration(
AI->getModule(),
getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys);
if (XLen == 64) {
Incr = Builder.CreateSExt(Incr, Builder.getInt64Ty());
Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
ShiftAmt = Builder.CreateSExt(ShiftAmt, Builder.getInt64Ty());
}
Value *Result;
// Must pass the shift amount needed to sign extend the loaded value prior
// to performing a signed comparison for min/max. ShiftAmt is the number of
// bits to shift the value into position. Pass XLen-ShiftAmt-ValWidth, which
// is the number of bits to left+right shift the value in order to
// sign-extend.
if (AI->getOperation() == AtomicRMWInst::Min ||
AI->getOperation() == AtomicRMWInst::Max) {
const DataLayout &DL = AI->getModule()->getDataLayout();
unsigned ValWidth =
DL.getTypeStoreSizeInBits(AI->getValOperand()->getType());
Value *SextShamt =
Builder.CreateSub(Builder.getIntN(XLen, XLen - ValWidth), ShiftAmt);
Result = Builder.CreateCall(LrwOpScwLoop,
{AlignedAddr, Incr, Mask, SextShamt, Ordering});
} else {
Result =
Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering});
}
if (XLen == 64)
Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
return Result;
}
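// Worked example: for an i8 atomicrmw min on RV32, ValWidth is 8 and
// SextShamt evaluates to (32 - 8) - ShiftAmt; the expansion shifts the loaded
// value left and then arithmetically right by that amount so the 8-bit field
// is sign-extended before the signed comparison.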
TargetLowering::AtomicExpansionKind
RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *CI) const {
unsigned Size = CI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
if (Size == 8 || Size == 16)
return AtomicExpansionKind::MaskedIntrinsic;
return AtomicExpansionKind::None;
}
Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
Value *Ordering = Builder.getIntN(XLen, static_cast<uint64_t>(Ord));
Intrinsic::ID CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i32;
if (XLen == 64) {
CmpVal = Builder.CreateSExt(CmpVal, Builder.getInt64Ty());
NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty());
Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64;
}
Type *Tys[] = {AlignedAddr->getType()};
Function *MaskedCmpXchg =
Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys);
Value *Result = Builder.CreateCall(
MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
if (XLen == 64)
Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
return Result;
}
unsigned RISCVTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return RISCV::X10;
}
unsigned RISCVTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return RISCV::X11;
}
Index: projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.h
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/RISCV/RISCVISelLowering.h (revision 351722)
@@ -1,211 +1,214 @@
//===-- RISCVISelLowering.h - RISCV DAG Lowering Interface ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that RISCV uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H
#define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H
#include "RISCV.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
class RISCVSubtarget;
namespace RISCVISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
RET_FLAG,
URET_FLAG,
SRET_FLAG,
MRET_FLAG,
CALL,
SELECT_CC,
BuildPairF64,
SplitF64,
TAIL,
// RV64I shifts, directly matching the semantics of the named RISC-V
// instructions.
SLLW,
SRAW,
SRLW,
// 32-bit operations from RV64M that can't be simply matched with a pattern
// at instruction selection time.
DIVW,
DIVUW,
REMUW,
// FPR32<->GPR transfer operations for RV64. Needed as an i32<->f32 bitcast
// is not legal on RV64. FMV_W_X_RV64 matches the semantics of the FMV.W.X.
// FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an any-extended result.
// This is a more convenient semantic for producing dagcombines that remove
// unnecessary GPR->FPR->GPR moves.
FMV_W_X_RV64,
FMV_X_ANYEXTW_RV64,
// READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target
// (returns (Lo, Hi)). It takes a chain operand.
READ_CYCLE_WIDE
};
}
class RISCVTargetLowering : public TargetLowering {
const RISCVSubtarget &Subtarget;
public:
explicit RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI);
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I = nullptr) const override;
bool isLegalICmpImmediate(int64_t Imm) const override;
bool isLegalAddImmediate(int64_t Imm) const override;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
bool hasBitPreservingFPLogic(EVT VT) const override;
// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override;
+
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
return isa<LoadInst>(I) || isa<StoreInst>(I);
}
Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::SIGN_EXTEND;
}
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
if (DAG.getMachineFunction().getFunction().hasMinSize())
return false;
return true;
}
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
unsigned
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins,
bool IsRet) const;
void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs,
bool IsRet, CallLoweringInfo *CLI) const;
// Lower incoming arguments, copy physregs into vregs
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override {
return true;
}
template <class NodeTy>
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
bool UseGOT) const;
SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
bool shouldConsiderGEPOffsetSplit() const override { return true; }
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const;
bool isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const;
TargetLowering::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
virtual Value *emitMaskedAtomicRMWIntrinsic(
IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const override;
TargetLowering::AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override;
virtual Value *
emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder, AtomicCmpXchgInst *CI,
Value *AlignedAddr, Value *CmpVal,
Value *NewVal, Value *Mask,
AtomicOrdering Ord) const override;
};
}
#endif
Index: projects/clang900-import/contrib/llvm/lib/Target/TargetMachine.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/TargetMachine.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/TargetMachine.cpp (revision 351722)
@@ -1,282 +1,284 @@
//===-- TargetMachine.cpp - General Target Information ---------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the general parts of a Target machine.
//
//===----------------------------------------------------------------------===//
#include "llvm/Target/TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
//---------------------------------------------------------------------------
// TargetMachine Class
//
TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
const Triple &TT, StringRef CPU, StringRef FS,
const TargetOptions &Options)
: TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU),
TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr),
RequireStructuredCFG(false), DefaultOptions(Options), Options(Options) {
}
TargetMachine::~TargetMachine() = default;
bool TargetMachine::isPositionIndependent() const {
return getRelocationModel() == Reloc::PIC_;
}
/// Reset the target options based on the function's attributes.
// FIXME: This function needs to go away for a number of reasons:
// a) global state on the TargetMachine is terrible in general,
// b) these target options should be passed only on the function
// and not on the TargetMachine (via TargetOptions) at all.
void TargetMachine::resetTargetOptions(const Function &F) const {
#define RESET_OPTION(X, Y) \
do { \
if (F.hasFnAttribute(Y)) \
Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \
else \
Options.X = DefaultOptions.X; \
} while (0)
RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
RESET_OPTION(NoTrappingFPMath, "no-trapping-math");
StringRef Denormal =
F.getFnAttribute("denormal-fp-math").getValueAsString();
if (Denormal == "ieee")
Options.FPDenormalMode = FPDenormal::IEEE;
else if (Denormal == "preserve-sign")
Options.FPDenormalMode = FPDenormal::PreserveSign;
else if (Denormal == "positive-zero")
Options.FPDenormalMode = FPDenormal::PositiveZero;
else
Options.FPDenormalMode = DefaultOptions.FPDenormalMode;
}
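// For example, a function carrying the attribute "unsafe-fp-math"="true" gets
// Options.UnsafeFPMath enabled for the duration of its code generation, while
// a function without the attribute falls back to the TargetMachine-wide
// default captured in DefaultOptions.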
/// Returns the code generation relocation model. The choices are static, PIC,
/// and dynamic-no-pic.
Reloc::Model TargetMachine::getRelocationModel() const { return RM; }
/// Returns the code model. The choices are small, kernel, medium, large, and
/// target default.
CodeModel::Model TargetMachine::getCodeModel() const { return CMModel; }
/// Get the IR-specified TLS model for Var.
static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
switch (GV->getThreadLocalMode()) {
case GlobalVariable::NotThreadLocal:
llvm_unreachable("getSelectedTLSModel for non-TLS variable");
break;
case GlobalVariable::GeneralDynamicTLSModel:
return TLSModel::GeneralDynamic;
case GlobalVariable::LocalDynamicTLSModel:
return TLSModel::LocalDynamic;
case GlobalVariable::InitialExecTLSModel:
return TLSModel::InitialExec;
case GlobalVariable::LocalExecTLSModel:
return TLSModel::LocalExec;
}
llvm_unreachable("invalid TLS model");
}
bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
const GlobalValue *GV) const {
// If the IR producer requested that this GV be treated as dso local, obey.
if (GV && GV->isDSOLocal())
return true;
// If we are not supposed to use a PLT, we cannot assume that intrinsics are
// local since the linker can convert some direct access to access via plt.
if (M.getRtLibUseGOT() && !GV)
return false;
// According to the llvm language reference, we should be able to
// just return false in here if we have a GV, as we know it is
// dso_preemptable. At this point in time, the various IR producers
// have not been transitioned to always produce a dso_local when it
// is possible to do so.
// In the case of intrinsics, GV is null and there is nowhere to put
// dso_local. Returning false for those will produce worse code in some
// architectures. For example, on x86 the caller has to set ebx before calling
// a plt.
// As a result we still have some logic in here to improve the quality of the
// generated code.
// FIXME: Add a module level metadata for whether intrinsics should be assumed
// local.
Reloc::Model RM = getRelocationModel();
const Triple &TT = getTargetTriple();
// DLLImport explicitly marks the GV as external.
if (GV && GV->hasDLLImportStorageClass())
return false;
// On MinGW, variables that haven't been declared with DLLImport may still
// end up automatically imported by the linker. To make this feasible,
// don't assume the variables to be DSO local unless we actually know
// that for sure. This only has to be done for variables; for functions
// the linker can insert thunks for calling functions from another DLL.
- if (TT.isWindowsGNUEnvironment() && GV && GV->isDeclarationForLinker() &&
- isa<GlobalVariable>(GV))
+ if (TT.isWindowsGNUEnvironment() && TT.isOSBinFormatCOFF() && GV &&
+ GV->isDeclarationForLinker() && isa<GlobalVariable>(GV))
return false;
// On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols
// remain unresolved in the link, they can be resolved to zero, which is
// outside the current DSO.
if (TT.isOSBinFormatCOFF() && GV && GV->hasExternalWeakLinkage())
return false;
// Every other GV is local on COFF.
// Make an exception for windows OS in the triple: Some firmware builds use
// *-win32-macho triples. This (accidentally?) produced windows relocations
// without GOT tables in older clang versions; keep this behaviour.
- if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO()))
+ // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables
+ // either.
+ if (TT.isOSBinFormatCOFF() || TT.isOSWindows())
return true;
// Most PIC code sequences that assume that a symbol is local cannot
// produce a 0 if it turns out the symbol is undefined. While this
// is ABI- and relocation-dependent, it seems worth it to handle it
// here.
if (GV && isPositionIndependent() && GV->hasExternalWeakLinkage())
return false;
if (GV && !GV->hasDefaultVisibility())
return true;
if (TT.isOSBinFormatMachO()) {
if (RM == Reloc::Static)
return true;
return GV && GV->isStrongDefinitionForLinker();
}
// Due to the AIX linkage model, any global with default visibility is
// considered non-local.
if (TT.isOSBinFormatXCOFF())
return false;
assert(TT.isOSBinFormatELF() || TT.isOSBinFormatWasm());
assert(RM != Reloc::DynamicNoPIC);
bool IsExecutable =
RM == Reloc::Static || M.getPIELevel() != PIELevel::Default;
if (IsExecutable) {
// If the symbol is defined, it cannot be preempted.
if (GV && !GV->isDeclarationForLinker())
return true;
// A symbol marked nonlazybind should not be accessed with a plt. If the
// symbol turns out to be external, the linker will convert a direct
// access to an access via the plt, so don't assume it is local.
const Function *F = dyn_cast_or_null<Function>(GV);
if (F && F->hasFnAttribute(Attribute::NonLazyBind))
return false;
bool IsTLS = GV && GV->isThreadLocal();
bool IsAccessViaCopyRelocs =
GV && Options.MCOptions.MCPIECopyRelocations && isa<GlobalVariable>(GV);
Triple::ArchType Arch = TT.getArch();
bool IsPPC =
Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::ppc64le;
// Check if we can use copy relocations. PowerPC has no copy relocations.
if (!IsTLS && !IsPPC && (RM == Reloc::Static || IsAccessViaCopyRelocs))
return true;
}
// ELF & wasm support preemption of other symbols.
return false;
}
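// For example, on ELF a global defined in the module being compiled as a PIE
// (or with static relocation) is assumed dso_local and can be referenced
// PC-relative, whereas the same definition compiled for a shared library
// (PIC without a PIE level) remains preemptible and is typically accessed
// through the GOT.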
bool TargetMachine::useEmulatedTLS() const {
// Returns Options.EmulatedTLS if the -emulated-tls or -no-emulated-tls flag
// was specified explicitly; otherwise uses the target triple to decide the default.
if (Options.ExplicitEmulatedTLS)
return Options.EmulatedTLS;
return getTargetTriple().hasDefaultEmulatedTLS();
}
TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default;
Reloc::Model RM = getRelocationModel();
bool IsSharedLibrary = RM == Reloc::PIC_ && !IsPIE;
bool IsLocal = shouldAssumeDSOLocal(*GV->getParent(), GV);
TLSModel::Model Model;
if (IsSharedLibrary) {
if (IsLocal)
Model = TLSModel::LocalDynamic;
else
Model = TLSModel::GeneralDynamic;
} else {
if (IsLocal)
Model = TLSModel::LocalExec;
else
Model = TLSModel::InitialExec;
}
// If the user specified a more specific model, use that.
TLSModel::Model SelectedModel = getSelectedTLSModel(GV);
if (SelectedModel > Model)
return SelectedModel;
return Model;
}
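// For example, a thread_local variable defined in a shared library built with
// PIC gets LocalDynamic when it is known dso_local and GeneralDynamic
// otherwise, while in an executable the defined case collapses to LocalExec;
// an explicit tls_model attribute can only tighten the choice, never loosen
// it.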
/// Returns the optimization level: None, Less, Default, or Aggressive.
CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }
void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
TargetTransformInfo TargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(F.getParent()->getDataLayout());
}
void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
const GlobalValue *GV, Mangler &Mang,
bool MayAlwaysUsePrivate) const {
if (MayAlwaysUsePrivate || !GV->hasPrivateLinkage()) {
// Simple case: If GV is not private, it is not important to find out if
// private labels are legal in this case or not.
Mang.getNameWithPrefix(Name, GV, false);
return;
}
const TargetLoweringObjectFile *TLOF = getObjFileLowering();
TLOF->getNameWithPrefix(Name, GV, *this);
}
MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const {
const TargetLoweringObjectFile *TLOF = getObjFileLowering();
SmallString<128> NameStr;
getNameWithPrefix(NameStr, GV, TLOF->getMangler());
return TLOF->getContext().getOrCreateSymbol(NameStr);
}
TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
// Since Analysis can't depend on Target, use a std::function to invert the
// dependency.
return TargetIRAnalysis(
[this](const Function &F) { return this->getTargetTransformInfo(F); });
}
Index: projects/clang900-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 351722)
@@ -1,45520 +1,45530 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
"x86-experimental-vector-widening-legalization", cl::init(false),
cl::desc("Enable an experimental vector type legalization through widening "
"rather than promotion."),
cl::Hidden);
static cl::opt<int> ExperimentalPrefLoopAlignment(
"x86-experimental-pref-loop-alignment", cl::init(4),
cl::desc("Sets the preferable loop alignment for experiments "
"(the last x86-experimental-pref-loop-alignment bits"
" of the loop header PC will be 0)."),
cl::Hidden);
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
"SHIFT, LEA, etc."),
cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
const char *Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget.isAtom())
setSchedulingPreference(Sched::ILP);
else if (Subtarget.is64Bit())
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 32);
}
if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
setLibcallName(RTLIB::SREM_I64, "_allrem");
setLibcallName(RTLIB::UREM_I64, "_aullrem");
setLibcallName(RTLIB::MUL_I64, "_allmul");
setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
if (Subtarget.isTargetDarwin()) {
// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(false);
setUseUnderscoreLongJmp(false);
} else if (Subtarget.isTargetWindowsGNU()) {
// MS runtime is weird: it exports _setjmp, but longjmp!
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(false);
} else {
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(true);
}
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
// 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
// FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
setMaxAtomicSizeInBitsSupported(32);
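// For example, on a 486-class target without cmpxchg8b, a 64-bit atomicrmw is
// therefore turned into an __atomic_* libcall by AtomicExpandPass instead of
// a cmpxchg8b-based loop.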
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , Custom);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , Custom);
}
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
if (Subtarget.is64Bit()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
// f32/f64 are legal, f80 is custom.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
else
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
} else if (!Subtarget.useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
} else {
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
}
// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
if (!Subtarget.useSoftFloat()) {
// SSE has no i16 to fp conversion, only i32.
if (X86ScalarSSEf32) {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
// f32 and f64 cases are Legal, f80 case is not
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
} else {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
}
} else {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
}
// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
// this operation.
setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
if (!Subtarget.useSoftFloat()) {
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
} else {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
}
// Handle FP_TO_UINT by promoting the destination to a larger signed
// conversion.
setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
if (Subtarget.is64Bit()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
} else {
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
}
} else if (!Subtarget.useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
else
// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
// With SSE3 we can use fisttpll to convert to a signed i64; without
// SSE, we're stuck with a fistpll.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
}
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
}
} else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
// the two-result form to trivial CSE, which is able to combine x/y and x%y
// into a single instruction.
//
// Scalar integer multiply-high is also lowered to use two-result
// operations, to match the available instructions. However, plain multiply
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
}
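// For example, an i32 expression computing both x / y and x % y is CSE'd into
// a single divrem node and selected as one DIV/IDIV, which leaves the quotient
// in EAX and the remainder in EDX.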
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::BR_CC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them on up to i32 which has a shorter
// encoding.
setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
}
if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
}
}
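// Note on the CTTZ/CTLZ choices above: plain BSF/BSR leave the destination
// undefined when the input is zero, so without BMI (TZCNT) or LZCNT the
// *_ZERO_UNDEF variants can map straight onto BSF/BSR (Legal), while the
// fully-defined CTTZ/CTLZ forms stay Custom so the lowering can add the
// zero-input handling (e.g. a CMOV of the bit width).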
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
// There's never any support for operations beyond MVT::f32.
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
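// Background for the half-precision block above: F16C provides
// VCVTPH2PS/VCVTPS2PH for f16<->f32 conversion, so only the f32 forms can
// ever be fast. Everything else (and any target without F16C) takes the
// Expand path, which typically becomes a compiler-rt conversion helper call.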
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::ConstantPool , VT, Custom);
setOperationAction(ISD::JumpTable , VT, Custom);
setOperationAction(ISD::GlobalAddress , VT, Custom);
setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
// 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SHL_PARTS, VT, Custom);
setOperationAction(ISD::SRA_PARTS, VT, Custom);
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
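// Note on the *_PARTS nodes: on 32-bit x86 a 64-bit shift is split into a
// (lo, hi, amount) form, and the custom lowering can then use the
// double-shift instructions (SHLD/SHRD) plus a check for shift amounts of
// 32 or more, instead of a generic multi-instruction expansion.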
if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
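// Note on the atomic choices above: marking the RMW nodes Custom lets the
// lowering pick the cheapest x86 idiom for each use (for example, a plain
// LOCK ADD/SUB/AND/OR/XOR when the fetched value is unused, XADD when it is
// needed, or a CMPXCHG loop otherwise), and the i128 compare-and-swap maps
// onto CMPXCHG16B when the subtarget has it.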
// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
bool Is64Bit = Subtarget.is64Bit();
setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
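// Note: VAARG/VACOPY are Custom only in 64-bit mode mainly because the
// System V AMD64 va_list is a structure (gp_offset, fp_offset,
// overflow_arg_area, reg_save_area) that needs target-specific code to walk,
// whereas the 32-bit va_list is a simple pointer that the generic Expand
// handles.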
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
// Disable f32->f64 extload as we can only generate this in one instruction
// under optsize. So it's easier to pattern match (fpext (load)) for that
// case instead of needing to emit 2 instructions for extload in the
// non-optsize case.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG, VT, Custom);
// Use ANDPD and ORPD to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
} else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
if (UseX87)
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f32, Custom);
if (UseX87)
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
// Use ANDPS and ORPS to simulate FCOPYSIGN.
if (UseX87)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (UseX87) {
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
} else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
}
// Expand FP32 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f32)) {
if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
addLegalFPImmediate(APFloat(+0.0f)); // FLD0
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
}
// Expand FP64 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f64)) {
if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
// Long double always uses X87, except f128 in MMX.
if (UseX87) {
if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
setOperationAction(ISD::FABS , MVT::f128, Custom);
setOperationAction(ISD::FNEG , MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
}
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
addLegalFPImmediate(TmpFlt); // FLD0
TmpFlt.changeSign();
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
&ignored);
addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , MVT::f80, Expand);
setOperationAction(ISD::FCOS , MVT::f80, Expand);
setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Expand);
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
}
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
}
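// There are no x86 vector instructions for sin/cos/pow/log/exp, so these
// stay Expand for every vector type here and end up as scalarized libm
// calls (a vector math library, when one is enabled at the IR level, would
// have replaced such calls before they reach this point).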
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
setOperationAction(ISD::TRUNCATE, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
// types; we have to deal with them whether we ask for Expansion or not.
// Setting Expand causes its own optimisation problems though, so leave
// them legal.
if (VT.getVectorElementType() == MVT::i1)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
// split/scalarized right now.
if (VT.getVectorElementType() == MVT::f16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx are supported; everything uses intrinsics.
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::SREM, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::UREM, VT, Custom);
}
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
setOperationAction(ISD::MUL, MVT::v2i16, Custom);
setOperationAction(ISD::MUL, MVT::v2i32, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i16, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
if (!ExperimentalVectorWideningLegalization) {
// Use widening instead of promotion.
for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
MVT::v4i16, MVT::v2i16 }) {
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
}
}
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
// Provide custom widening for v2f32 setcc. This is really for VLX, where
// the setcc result type is a v2i1/v4i1 vector for v2f32/v4f32, leading
// type legalization to change the result type to v4i1 during widening.
// It works fine for SSE2 and is probably faster, so there is no need to
// qualify it with VLX support.
setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// We support custom legalizing of sext and anyext loads for specific
// memory vector types which we can load as a scalar (or sequence of
// scalars) and extend in-register to a legal 128-bit vector type. For sext
// loads these must work with a single scalar load.
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
}
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
// By marking FP_TO_SINT v8i16 as Custom, we trick type legalization into
// promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
// split again based on the input type, this will cause an AssertSExt i16 to
// be emitted instead of an AssertZExt. This will allow packssdw followed by
// packuswb to be used to truncate to v8i8. This is necessary since packusdw
// isn't available until sse4.1.
setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
if (ExperimentalVectorWideningLegalization) {
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
} else {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
}
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
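// Note: SSE2 has no per-element variable shifts, so the custom lowering for
// the loop above falls back to tricks such as shifting by a splat amount,
// multiplying by powers of two, or scalarizing. AVX2's per-element shifts
// (e.g. VPSLLVD/VPSRLVD/VPSRAVD) are what the comment above refers to as
// the "legal v4i32/v2i64 cases".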
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
// With AVX512, expanding (and promoting the shifts) is better.
if (!Subtarget.hasAVX512())
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
setOperationAction(ISD::ABS, MVT::v16i8, Legal);
setOperationAction(ISD::ABS, MVT::v8i16, Legal);
setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::ADD, MVT::i16, Custom);
setOperationAction(ISD::ADD, MVT::i32, Custom);
setOperationAction(ISD::SUB, MVT::i16, Custom);
setOperationAction(ISD::SUB, MVT::i32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::FCEIL, RoundedTy, Legal);
setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
// We directly match byte blends in the backend as they match the VSELECT
// condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
if (!ExperimentalVectorWideningLegalization) {
// Avoid narrow result types when widening. The legal types are listed
// in the next loop.
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
}
}
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
if (!ExperimentalVectorWideningLegalization)
setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
}
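// Note: a Legal entry in setLoadExtAction() means the extending load can be
// selected as a single instruction, so e.g. a sign-extending load of four
// i8s into a v4i32 becomes one PMOVSXBD from memory instead of a vector
// load followed by a separate extend.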
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::ROTL, VT, Custom);
// XOP can efficiently perform BITREVERSE with VPPERM.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
// These types need custom splitting if their input is a 128-bit vector.
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is better.
if (!Subtarget.hasBWI())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::FMA, VT, Legal);
}
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
}
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
setOperationAction(ISD::ABS, MVT::v4i64, Custom);
setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
}
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
if (HasInt256) {
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
}
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
if (HasInt256)
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
if (HasInt256) {
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MGATHER, VT, Custom);
}
}
// This block controls legalization of the mask vector sizes that are
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
setOperationAction(ISD::STORE, MVT::v1i1, Custom);
setOperationAction(ISD::STORE, MVT::v2i1, Custom);
setOperationAction(ISD::STORE, MVT::v4i1, Custom);
setOperationAction(ISD::STORE, MVT::v8i1, Custom);
}
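// Background: base AVX512F only provides KMOVW for moving 16-bit masks
// between k-registers and memory/GPRs; the byte-sized KMOVB needs AVX512DQ.
// That is why, without DQI, loads/stores of the narrow mask types above are
// Custom (handled via wider moves or GPR copies).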
// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
}
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
// This block controls legalization for 512-bit operations with 32/64 bit
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
// k-masks.
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
if (ExperimentalVectorWideningLegalization) {
// Need to custom widen this if we don't have AVX512BW.
setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
}
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
}
// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
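// Background: AVX512CD supplies VPLZCNTD/VPLZCNTQ (hence CTLZ Legal above),
// and AVX512VPOPCNTDQ supplies VPOPCNTD/VPOPCNTQ (hence CTPOP Legal);
// without these features the Custom lowerings set earlier remain in effect.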
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Legal under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
// Need to custom split v32i16/v64i8 bitcasts.
if (!Subtarget.hasBWI()) {
setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
}
if (Subtarget.hasVBMI2()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
}// has AVX-512
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
}
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SINT_TO_FP, VT, Legal);
setOperationAction(ISD::UINT_TO_FP, VT, Legal);
setOperationAction(ISD::FP_TO_SINT, VT, Legal);
setOperationAction(ISD::FP_TO_UINT, VT, Legal);
setOperationAction(ISD::MUL, VT, Legal);
}
}
if (Subtarget.hasCDI()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
// This block controls legalization of v32i1/v64i1, which are available with
// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
// useBWIRegs.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
for (auto VT : { MVT::v16i1, MVT::v32i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Extends from v32i1 masks to 256-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
}
// This block controls legalization for v32i16 and v64i8. 512-bits can be
// disabled based on prefer-vector-width and required-vector-width function
// attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v64i8, MVT::v32i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
if (Subtarget.hasVBMI2()) {
setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
}
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
}
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
if (Subtarget.hasVBMI2()) {
// TODO: Make these legal even without VLX?
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
// Support carry in as value rather than glue.
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setLibcallName(RTLIB::MUL_I128, nullptr);
}
// Combine sin / cos into _sincos_stret if it is available.
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
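// On Win64, 128-bit integer division and remainder are custom lowered so the
// resulting libcalls pass their i128 operands the way the Win64 ABI expects.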
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
ISD::FLOG10, ISD::FPOW, ISD::FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget.getRegisterInfo());
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
verifyIntrinsicTables();
}
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
bool X86TargetLowering::useStackGuardXorFP() const {
// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
return Subtarget.getTargetTriple().isOSMSVCRT();
}
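// Emit the XOR32_FP/XOR64_FP pseudo that XORs the frame pointer into the
// loaded stack-guard value, matching the MSVC CRT scheme described above.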
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const {
EVT PtrTy = getPointerTy(DAG.getDataLayout());
unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
return SDValue(Node, 0);
}
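// v32i1 has no legal mask register without BWI, so split it; with the
// experimental widening legalization enabled, prefer widening for all other
// non-i1 vector types.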
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return TypeSplitVector;
if (ExperimentalVectorWideningLegalization &&
VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
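// Without BWI there is no legal v32i1 mask register, so v32i1 arguments and
// return values are passed as a single v32i8 vector register; the two hooks
// below reflect that.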
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return MVT::v32i8;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
return MVT::i8;
if (Subtarget.hasAVX512()) {
const unsigned NumElts = VT.getVectorNumElements();
// Figure out what this type will be legalized to.
EVT LegalVT = VT;
while (getTypeAction(Context, LegalVT) != TypeLegal)
LegalVT = getTypeToTransformTo(Context, LegalVT);
// If we got a 512-bit vector then we'll definitely have a vXi1 compare.
if (LegalVT.getSimpleVT().is512BitVector())
return EVT::getVectorVT(Context, MVT::i1, NumElts);
if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
// If we legalized to less than a 512-bit vector, then we will use a vXi1
// compare for vXi32/vXi64 for sure. If we have BWI we will also support
// vXi16/vXi8.
MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
return EVT::getVectorVT(Context, MVT::i1, NumElts);
}
}
return VT.changeVectorElementTypeToInteger();
}
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (MaxAlign == 16)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getBitWidth() == 128)
MaxAlign = 16;
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
unsigned EltAlign = 0;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
unsigned EltAlign = 0;
getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
if (MaxAlign == 16)
break;
}
}
}
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = DL.getABITypeAlignment(Ty);
if (TyAlign > 8)
return TyAlign;
return 8;
}
unsigned Align = 4;
if (Subtarget.hasSSE1())
getMaxByValAlign(Ty, Align);
return Align;
}
/// Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, we are expanding a memset; if
/// 'ZeroMemset' is also true, it is a memset of zero. 'MemcpyStrSrc'
/// indicates whether the memcpy source is constant so it does not need to be
/// loaded.
/// Returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const {
if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
// FIXME: Check if unaligned 32-byte accesses are slow.
if (Size >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
// getMemsetStores() may create an intermediate splat (using an integer
// multiply) before we splat as a vector.
return MVT::v32i8;
}
if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
// If we have SSE1 registers we should be able to use them.
if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
(Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
!Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
// Also, do not use f64 to lower memset unless this is a memset of zeros.
// The gymnastics of splatting a byte value into an XMM register and then
// only using 8-byte stores (because this is a CPU with slow unaligned
// 16-byte accesses) makes that a loser.
return MVT::f64;
}
}
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
if (Subtarget.is64Bit() && Size >= 8)
return MVT::i64;
return MVT::i32;
}
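// Without SSE support for the type, f32/f64 loads and stores go through the
// x87 stack (fld/fstp), which performs format conversion, so they are not
// safe types for expanding memcpy/memset inline.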
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
if (VT == MVT::f32)
return X86ScalarSSEf32;
else if (VT == MVT::f64)
return X86ScalarSSEf64;
return true;
}
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
default:
// 8-byte and under are always assumed to be fast.
*Fast = true;
break;
case 128:
*Fast = !Subtarget.isUnalignedMem16Slow();
break;
case 256:
*Fast = !Subtarget.isUnalignedMem32Slow();
break;
// TODO: What about AVX-512 (512-bit) accesses?
}
}
// NonTemporal vector memory ops must be aligned.
if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
// NT loads can only be vector aligned, so if it's less aligned than the
// minimum vector size (which we can split the vector down to), we might as
// well use a regular unaligned vector load.
// We don't have any NT loads pre-SSE41.
if (!!(Flags & MachineMemOperand::MOLoad))
return (Align < 16 || !Subtarget.hasSSE41());
return false;
}
// Misaligned accesses of any size are always allowed.
return true;
}
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol.
if (isPositionIndependent() && Subtarget.isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
// Otherwise, use the normal jump table encoding heuristics.
return TargetLowering::getJumpTableEncoding();
}
bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
ArgListTy &Args) const {
// Only relabel X86-32 for C / Stdcall CCs.
if (Subtarget.is64Bit())
return;
if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
return;
unsigned ParamRegs = 0;
if (auto *M = MF->getFunction().getParent())
ParamRegs = M->getNumberRegisterParameters();
// Mark the first N integer arguments as being passed in registers.
for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
Type *T = Args[Idx].Ty;
if (T->isIntOrPtrTy())
if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
unsigned numRegs = 1;
if (MF->getDataLayout().getTypeAllocSize(T) > 4)
numRegs = 2;
if (ParamRegs < numRegs)
return;
ParamRegs -= numRegs;
Args[Idx].IsInReg = true;
}
}
}
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid,MCContext &Ctx) const{
assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
return MCSymbolRefExpr::create(MBB->getSymbol(),
MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget.is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()));
return Table;
}
/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
// X86-64 uses RIP relative addressing based on the jump table label.
if (Subtarget.isPICStyleRIPRel())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
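// Pick a representative register class and spill-cost weight per value type:
// GR32/GR64 for integers, VR64 for MMX, and the 128-bit XMM class for f32/f64
// and the SSE/AVX/AVX-512 vector types.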
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
case MVT::x86mmx:
RRC = &X86::VR64RegClass;
break;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
case MVT::v8f32: case MVT::v4f64:
case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
case MVT::v16f32: case MVT::v8f64:
RRC = &X86::VR128XRegClass;
break;
}
return std::make_pair(RRC, Cost);
}
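// Address space used for segment-relative stack-guard and SafeStack accesses:
// 257 is %fs and 256 is %gs in the X86 backend's addressing convention.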
unsigned X86TargetLowering::getAddressSpace() const {
if (Subtarget.is64Bit())
return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
return 256;
}
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}
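// Build a constant pointer to Offset within the segment selected by
// AddressSpace (256 = %gs, 257 = %fs; see getAddressSpace above).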
static Constant* SegmentOffset(IRBuilder<> &IRB,
unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// glibc, bionic, and Fuchsia have a special slot for the stack guard in
// tcbhead_t; use it instead of the usual global variable (see
// sysdeps/{i386,x86_64}/nptl/tls.h)
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
if (Subtarget.isTargetFuchsia()) {
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
// %fs:0x28, unless we're using a Kernel code model, in which case
// it's %gs:0x28. gs:0x14 on i386.
unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
}
return TargetLowering::getIRStackGuard(IRB);
}
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// The MSVC CRT provides functionality for stack protection.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
"__security_check_cookie", Type::getVoidTy(M.getContext()),
Type::getInt8PtrTy(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::X86_FastCall);
F->addAttribute(1, Attribute::AttrKind::InReg);
}
return;
}
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getGlobalVariable("__security_cookie");
}
return TargetLowering::getSDagStackGuard(M);
}
Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getFunction("__security_check_cookie");
}
return TargetLowering::getSSPStackGuardCheck(M);
}
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
if (Subtarget.isTargetAndroid()) {
// %fs:0x48, unless we're using a Kernel code model, in which case it's
// %gs:0x48. %gs:0x24 on i386.
unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
// Fuchsia is similar.
if (Subtarget.isTargetFuchsia()) {
// <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
return SegmentOffset(IRB, 0x18, getAddressSpace());
}
return TargetLowering::getSafeStackPointerLocation(IRB);
}
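// Casts between the ordinary address spaces (below 256) are no-ops; casts
// involving the segment address spaces (%gs/%fs/%ss, 256 and up) are not.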
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
return SrcAS < 256 && DestAS < 256;
}
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
bool X86TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
return ScratchRegs;
}
/// Lowers mask values (v*i1) to the local register values.
/// \returns DAG node after lowering to register type.
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
const SDLoc &Dl, SelectionDAG &DAG) {
EVT ValVT = ValArg.getValueType();
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
DAG.getIntPtrConstant(0, Dl));
if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
// Two stage lowering might be required
// bitcast: v8i1 -> i8 / v16i1 -> i16
// anyextend: i8 -> i32 / i16 -> i32
EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
if (ValLoc == MVT::i32)
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
return ValToCopy;
}
if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
(ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
// One stage lowering is required
// bitcast: v32i1 -> i32 / v64i1 -> i64
return DAG.getBitcast(ValLoc, ValArg);
}
return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The value should reside in two registers");
// Before splitting the value we cast it to i64
Arg = DAG.getBitcast(MVT::i64, Arg);
// Splitting the value into two i32 types
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(0, Dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(1, Dl, MVT::i32));
// Attach the two i32 values to their corresponding registers
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
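// Lower outgoing return values: copy them into the appropriate return
// registers (or push them as FP stack operands) and build the final RET_FLAG
// or IRET node.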
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
// In some cases we need to disable registers from the default CSR list.
// For example, when they are used for argument passing.
bool ShouldDisableCalleeSavedRegister =
CallConv == CallingConv::X86_RegCall ||
MF.getFunction().hasFnAttribute("no_caller_saved_registers");
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
SDValue Flag;
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
MVT::i32));
// Copy the result values into the output registers.
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// Add the register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
// Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
else
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
}
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
(Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (ValVT == MVT::f64 &&
(Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
// Likewise we can't return F64 values with SSE1 only. gcc does so, but
// llvm-gcc has never done it right and no one has noticed, so this
// should be OK for now.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
if (VA.getLocReg() == X86::FP0 ||
VA.getLocReg() == X86::FP1) {
// If this is a copy from an xmm register to ST(0), use an FPExtend to
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
RetOps.push_back(ValToCopy);
// Don't emit a copytoreg.
continue;
}
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
}
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
Subtarget);
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
// Add nodes to the DAG and add the values into the RetOps list
for (auto &Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
}
}
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
//
// Checking Function.hasStructRetAttr() here is insufficient because the IR
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
// For the case of sret and another return value, we have
// Chain_0 at the function entry
// Chain_1 = getCopyToReg(Chain_0) in the above loop
// If we use Chain_1 in getCopyFromReg, we will have
// Val = getCopyFromReg(Chain_1)
// Chain_2 = getCopyToReg(Chain_1, Val) from below
// getCopyToReg(Chain_0) will be glued together with
// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
// Data dependency from Unit B to Unit A due to usage of Val in
// getCopyToReg(Chain_1, Val)
// Chain dependency from Unit A to Unit B
// So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
unsigned RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
// Add the returned register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (X86::GR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
X86ISD::NodeType opcode = X86ISD::RET_FLAG;
if (CallConv == CallingConv::X86_INTR)
opcode = X86ISD::IRET;
return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
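// Returns true if the value produced by N is consumed only by a return, in
// which case a libcall computing it can itself be emitted as a tail call;
// Chain is updated to the chain that tail call should use.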
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
return false;
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
bool HasRet = false;
for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() != X86ISD::RET_FLAG)
return false;
// If we are returning more than one value, we can definitely
// not make a tail call; see PR19530.
if (UI->getNumOperands() > 4)
return false;
if (UI->getNumOperands() == 4 &&
UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
return false;
HasRet = true;
}
if (!HasRet)
return false;
Chain = TCChain;
return true;
}
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const {
MVT ReturnMVT = MVT::i32;
bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
// The ABI does not require i1, i8 or i16 to be extended.
//
// On Darwin, there is code in the wild relying on Clang's old behaviour of
// always extending i8/i16 return values, so keep doing that for now.
// (PR26665).
ReturnMVT = MVT::i8;
}
EVT MinVT = getRegisterType(Context, ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
}
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
///                        glue purposes. In case the DAG is already using a
///                        physical register instead of a virtual one, we
///                        should glue our new SDValue to the InFlag SDValue.
/// \return a new 64-bit SDValue.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
const SDLoc &Dl, const X86Subtarget &Subtarget,
SDValue *InFlag = nullptr) {
assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(VA.getValVT() == MVT::v64i1 &&
"Expecting first location of 64 bit width type");
assert(NextVA.getValVT() == VA.getValVT() &&
"The locations should have the same type");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The values should reside in two registers");
SDValue Lo, Hi;
SDValue ArgValueLo, ArgValueHi;
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterClass *RC = &X86::GR32RegClass;
// Read a 32 bit value from the registers.
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
} else {
// When a physical register is available read the value from it and glue
// the reads together.
ArgValueLo =
DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
*InFlag = ArgValueLo.getValue(2);
ArgValueHi =
DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
*InFlag = ArgValueHi.getValue(2);
}
// Convert the i32 type into v32i1 type.
Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
// Convert the i32 type into v32i1 type.
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
// Concatenate the two values together.
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
/// Lowers a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
const EVT &ValLoc, const SDLoc &Dl,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
if (ValVT == MVT::v64i1) {
// On 32-bit targets this case is handled by getv64i1Argument.
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
// On 64-bit targets there is no need to truncate the value, only bitcast it.
} else {
MVT maskLen;
switch (ValVT.getSimpleVT().SimpleTy) {
case MVT::v8i1:
maskLen = MVT::i8;
break;
case MVT::v16i1:
maskLen = MVT::i16;
break;
case MVT::v32i1:
maskLen = MVT::i32;
break;
default:
llvm_unreachable("Expecting a vector of i1 types");
}
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
}
return DAG.getBitcast(ValVT, ValReturned);
}
/// Lower the result values of a call into the
/// appropriate copies out of the corresponding physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget.is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
++I, ++InsIndex) {
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
// In some calling conventions we need to remove the used registers
// from the register mask.
if (RegMask) {
for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
if (!Subtarget.hasX87())
report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
SDValue Val;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Val =
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
} else {
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
.getValue(1);
Val = Chain.getValue(0);
InFlag = Chain.getValue(2);
}
if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
if (VA.getValVT().isVector() &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
} else
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
InVals.push_back(Val);
}
return Chain;
}
//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// The StdCall calling convention is standard for many Windows API routines.
// It differs from the C calling convention only slightly: the callee cleans
// up the stack rather than the caller, and symbols are decorated differently.
// It doesn't support any vector arguments.
// For info on fast calling convention see Fast Calling Convention (tail call)
// implementation LowerX86_32FastCCCallTo.
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
NotStructReturn,
RegStructReturn,
StackStructReturn
};
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
if (Outs.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
if (Ins.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
/*isVolatile*/false, /*AlwaysInline=*/true,
/*isTailCall*/false,
MachinePointerInfo(), MachinePointerInfo());
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
CC == CallingConv::HHVM);
}
/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
// C calling conventions:
case CallingConv::C:
case CallingConv::Win64:
case CallingConv::X86_64_SysV:
// Callee pop conventions:
case CallingConv::X86_ThisCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
case CallingConv::X86_FastCall:
// Swift:
case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
}
}
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}
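// A call can be emitted as a tail call only if it is marked 'tail' in the IR,
// the caller does not carry "disable-tail-calls"="true", and the callee's
// calling convention is one we may tail call.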
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
auto Attr =
CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
if (!CI->isTailCall() || Attr.getValueAsString() == "true")
return false;
ImmutableCallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
return true;
}
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo &MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool AlwaysUseMutable = shouldGuaranteeTCO(
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If the value is passed by pointer, we receive its address instead of the
// value itself. No need to extend if the mask value and its location share
// the same absolute size.
bool ExtendedInMem =
VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
ValVT = VA.getLocVT();
else
ValVT = VA.getValVT();
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization, mark all arguments mutable, since they
// could be overwritten by the lowering of arguments in case of a tail call.
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
// FIXME: For now, all byval parameter objects are marked as aliasing. This
// can be improved with deeper analysis.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
/*isAliased=*/true);
return DAG.getFrameIndex(FI, PtrVT);
}
// This is an argument in memory. We might be able to perform copy elision.
// If the argument is passed directly in memory without any extension, then we
// can perform copy elision. Large vector types, for example, may be passed
// indirectly by pointer.
if (Flags.isCopyElisionCandidate() &&
VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
// If this is a one-part value or the first part of a multi-part value,
// create a stack object for the entire argument value type and return a
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
/*IsImmutable=*/false);
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
} else {
// This is not the first piece of an argument in memory. See if there is
// already a fixed stack object including this offset. If so, assume it
// was created by the PartOffset == 0 branch above and create a load from
// the appropriate offset into it.
int64_t PartBegin = VA.getLocMemOffset();
int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
int FI = MFI.getObjectIndexBegin();
for (; MFI.isFixedObjectIndex(FI); ++FI) {
int64_t ObjBegin = MFI.getObjectOffset(FI);
int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
break;
}
if (MFI.isFixedObjectIndex(FI)) {
SDValue Addr =
DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
return DAG.getLoad(
ValVT, dl, Chain, Addr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
Ins[i].PartOffset));
}
}
}
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), isImmutable);
// Set SExt or ZExt flag.
if (VA.getLocInfo() == CCValAssign::ZExt) {
MFI.setObjectZExt(FI, true);
} else if (VA.getLocInfo() == CCValAssign::SExt) {
MFI.setObjectSExt(FI, true);
}
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
return ExtendedInMem
? (VA.getValVT().isVector()
? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
: Val;
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
assert(Subtarget.is64Bit());
if (Subtarget.isCallingConvWin64(CallConv)) {
static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
}
static const MCPhysReg GPR64ArgRegs64Bit[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
assert(Subtarget.is64Bit());
if (Subtarget.isCallingConvWin64(CallConv)) {
// The XMM registers which might contain var arg parameters are shadowed
// in their paired GPR. So we only need to save the GPR to their home
// slots.
// TODO: __vectorcall will change this.
return None;
}
const Function &F = MF.getFunction();
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool isSoftFloat = Subtarget.useSoftFloat();
assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
static const MCPhysReg XMMArgRegs64Bit[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
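// Debug-only check that argument locations are sorted by original value
// number; the lowering loops below rely on this ordering.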
#ifndef NDEBUG
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
});
}
#endif
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Function &F = MF.getFunction();
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
F.getName() == "main")
FuncInfo->setForceFramePointer(true);
MachineFrameInfo &MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(
!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeArguments(Ins, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
}
// The next loop assumes that the locations are in the same order as the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++InsIndex) {
assert(InsIndex < Ins.size() && "Invalid Ins index");
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
if (VA.needsCustom()) {
assert(
VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// In the regcall calling convention, v64i1 values compiled for a 32-bit
// architecture are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
const TargetRegisterClass *RC;
if (RegVT == MVT::i8)
RC = &X86::GR8RegClass;
else if (RegVT == MVT::i16)
RC = &X86::GR16RegClass;
else if (RegVT == MVT::i32)
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
else if (RegVT == MVT::f80)
RC = &X86::RFP80RegClass;
else if (RegVT == MVT::f128)
RC = &X86::VR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
else if (RegVT.is128BitVector())
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
else if (RegVT == MVT::v16i1)
RC = &X86::VK16RegClass;
else if (RegVT == MVT::v32i1)
RC = &X86::VK32RegClass;
else if (RegVT == MVT::v64i1)
RC = &X86::VK64RegClass;
else
llvm_unreachable("Unknown argument type!");
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::BCvt)
ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
if (VA.isExtInLoc()) {
// Handle MMX values passed in XMM regs.
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
else if (VA.getValVT().isVector() &&
VA.getValVT().getScalarType() == MVT::i1 &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
} else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
}
} else {
assert(VA.isMemLoc());
ArgValue =
LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
}
// If the value is passed via a pointer, do a load.
if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
ArgValue =
DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
InVals.push_back(ArgValue);
}
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
if (CallConv == CallingConv::Swift)
continue;
// All x86 ABIs require that for returning structs by value we copy the
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
break;
}
}
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
if (shouldGuaranteeTCO(CallConv,
MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes a variable number of arguments, make a frame index
// for the start of the first vararg value... for expansion of llvm.va_start.
// We can skip this if there are no va_start calls.
if (MFI.hasVAStart() &&
(Is64Bit || (CallConv != CallingConv::X86_FastCall &&
CallConv != CallingConv::X86_ThisCall))) {
FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
}
// Figure out if XMM registers are in use.
assert(!(Subtarget.useSoftFloat() &&
F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
if (Is64Bit && isVarArg && MFI.hasVAStart()) {
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
// Gather all the live in physical registers.
SmallVector<SDValue, 6> LiveGPRs;
SmallVector<SDValue, 8> LiveXMMRegs;
SDValue ALVal;
for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
LiveGPRs.push_back(
DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
}
if (!ArgXMMs.empty()) {
unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
LiveXMMRegs.push_back(
DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
}
}
if (IsWin64) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
FuncInfo->setRegSaveFrameIndex(
MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
// Fixup to set vararg frame on shadow area (4 x i64).
if (NumIntRegs < 4)
FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so
// they may be loaded by dereferencing the result of va_next.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
}
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
getPointerTy(DAG.getDataLayout()));
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(),
FuncInfo->getRegSaveFrameIndex(), Offset));
MemOps.push_back(Store);
Offset += 8;
}
if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
// Now store the XMM (fp + vector) parameter registers.
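// VASTART_SAVE_XMM_REGS carries (chain, AL, reg-save frame index, FP offset,
// live XMM values); it is expanded later so that the XMM spills can be skipped
// when AL indicates that no SSE registers were used.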
SmallVector<SDValue, 12> SaveXMMOps;
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
SaveXMMOps.push_back(DAG.getIntPtrConstant(
FuncInfo->getRegSaveFrameIndex(), dl));
SaveXMMOps.push_back(DAG.getIntPtrConstant(
FuncInfo->getVarArgsFPOffset(), dl));
SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
LiveXMMRegs.end());
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
MVT::Other, SaveXMMOps));
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
}
if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
if (Subtarget.hasAVX512() &&
(Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
VecVT = MVT::v4f32;
// We forward some GPRs and some vector types.
SmallVector<MVT, 2> RegParmTypes;
MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
RegParmTypes.push_back(IntVT);
if (VecVT != MVT::Other)
RegParmTypes.push_back(VecVT);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
// Conservatively forward AL on x86_64, since it might be used for varargs.
if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
}
// Copy all forwards from physical to virtual registers.
for (ForwardedRegister &FR : Forwards) {
// FIXME: Can we use a less constrained schedule?
SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
}
}
// Some CCs need callee pop.
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
if (CallConv == CallingConv::X86_FastCall ||
CallConv == CallingConv::X86_ThisCall)
// fastcc functions can't have varargs.
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
FuncInfo->setArgumentStackSize(StackSize);
if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
assert(Is64Bit);
// TODO: Add a mechanism to frame lowering that will allow us to indicate
// that we'd prefer this slot be allocated towards the bottom of the frame
// (i.e. near the stack pointer after allocating the frame). Every
// funclet needs a copy of this slot in its (mostly empty) frame, and the
// offset from the bottom of this and each funclet's frame must be the
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
if (CallConv == CallingConv::X86_RegCall ||
F.hasFnAttribute("no_caller_saved_registers")) {
MachineRegisterInfo &MRI = MF.getRegInfo();
for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
MRI.disableCalleeSavedRegister(Pair.first);
}
return Chain;
}
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
SDValue Arg, const SDLoc &dl,
SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
if (Flags.isByVal())
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
return DAG.getStore(
Chain, dl, Arg, PtrOff,
MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
/// Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
bool Is64Bit, int FPDiff, const SDLoc &dl) const {
// Adjust the Return address stack slot.
EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
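// The load produces (value, chain); hand the chain back so the caller can
// order the later store of the return address after this load.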
return SDValue(OutRetAddr.getNode(), 1);
}
/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
EVT PtrVT, unsigned SlotSize,
int FPDiff, const SDLoc &dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
int NewReturnAddrFI =
MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
false);
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), NewReturnAddrFI));
return Chain;
}
/// Returns a vector_shuffle node for a movs{s|d} or movd
/// operation of the specified width.
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
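// Build the mask <NumElems, 1, 2, ..., NumElems-1>: lane 0 is taken from V2
// and the remaining lanes from V1, which is exactly the movss/movsd behaviour
// (replace only the low element).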
Mask.push_back(NumElems);
for (unsigned i = 1; i != NumElems; ++i)
Mask.push_back(i);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
bool &isTailCall = CLI.IsTailCall;
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
if (Attr.getValueAsString() == "true")
isTailCall = false;
if (Subtarget.isPICStyleGOT() &&
!MF.getTarget().Options.GuaranteedTailCallOpt) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that requires lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G || (!G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility()))
isTailCall = false;
}
bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
} else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction().hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
IsSibcall = true;
if (isTailCall)
++NumTailCalls;
}
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
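// (The 32 bytes are the four 8-byte home slots for RCX, RDX, R8 and R9 that
// the Win64 ABI requires the caller to reserve.)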
CCInfo.AnalyzeArguments(Outs, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are already available in the
// caller's incoming argument stack area, so nothing needs to be pushed.
NumBytes = 0;
else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
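// A negative FPDiff means the callee needs more argument stack space than the
// caller reserved, so the return address must be moved down by -FPDiff bytes
// (see the stack layout sketch before GetAlignedArgumentStackSize).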
// Record the delta by which the return address stack slot moves, but only
// if this delta is smaller (i.e. a larger downward move) than the previous one.
if (FPDiff < X86Info->getTCReturnAddrDelta())
X86Info->setTCReturnAddrDelta(FPDiff);
}
unsigned NumBytesToPush = NumBytes;
unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been allocated
// for us and will be right at the top of the stack. We don't support multiple
// arguments passed in memory when using inalloca.
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
NumBytesToPush = 0;
if (!ArgLocs.back().isMemLoc())
report_fatal_error("cannot use inalloca attribute on a register "
"parameter");
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
}
if (!IsSibcall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
if (isTailCall && FPDiff)
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// The next loop assumes that the locations are in the same order as the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
if (Flags.isInAlloca())
continue;
CCValAssign &VA = ArgLocs[I];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[OutIndex];
bool isByVal = Flags.isByVal();
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
Arg.getValueType().getVectorElementType() == MVT::i1)
Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getBitcast(MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
} else
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getBitcast(RegVT, Arg);
break;
case CCValAssign::Indirect: {
if (isByVal) {
// Memcpy the argument to a temporary stack slot to prevent
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
// From now on treat this as a regular pointer
Arg = StackSlot;
isByVal = false;
} else {
// Store the argument.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
}
break;
}
}
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.EnableDebugEntryValues)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// The Win64 ABI requires an argument passed in an XMM register to also be
// copied to the corresponding shadow GPR if the callee is a varargs function.
unsigned ShadowReg = 0;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
case X86::XMM2: ShadowReg = X86::R8; break;
case X86::XMM3: ShadowReg = X86::R9; break;
}
if (ShadowReg)
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires the GOT pointer to be in the EBX register before
// making function calls via the PLT.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
// the tail jump. This is done to circumvent the ebx/callee-saved problem
// for tail calls on PIC/GOT architectures. Normally we would just put the
// address of GOT into ebx and then call target@PLT. But for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to the
// target@PLT.
// Note: The actual moving to ECX is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (G && !G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility())
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee, DAG);
}
}
if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as a hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
}
if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
}
}
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
// don't need this because the eligibility check rejects calls that require
// shuffling arguments passed in memory.
if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
// the alias isn't otherwise explicit. This is slightly more conservative
// than necessary, because it means that each store effectively depends
// on every argument instead of just those arguments it would clobber.
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
"Expecting custom case only in regcall calling convention");
// This means that we are in the special case where one argument was
// passed through two register locations - skip the next location.
++I;
}
continue;
}
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
// Skip inalloca arguments. They don't require any work.
if (Flags.isInAlloca())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(DAG.getStore(
ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
getPointerTy(DAG.getDataLayout()),
RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
} else if (Callee->getOpcode() == ISD::GlobalAddress ||
Callee->getOpcode() == ISD::ExternalSymbol) {
// Lower direct calls to global addresses and external symbols. Setting
// ForCall to true here has the effect of removing WrapperRIP when possible
// to allow direct calls to be selected without first materializing the
// address into a register.
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit one, as the x32 ABI requires.
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
if (!IsSibcall && isTailCall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
Ops.push_back(Chain);
Ops.push_back(Callee);
if (isTailCall)
Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
// If HasNCSR is set (the NoCallerSavedRegisters attribute is present) then we
// use the X86_INTR calling convention because it has the same CSR mask
// (same preserved registers).
const uint32_t *Mask = RegInfo->getCallPreservedMask(
MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
// personality, assume the function clobbers all registers. If an exception
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn.hasPersonalityFn()
? classifyEHPersonality(CallerFn.getPersonalityFn())
: EHPersonality::Unknown;
if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
}
// Define a new register mask from the existing mask.
uint32_t *RegMask = nullptr;
// In some calling conventions we need to remove the used physical registers
// from the reg mask.
if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
RegMask = MF.allocateRegMask();
unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
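// In a register mask operand a set bit means the register is preserved across
// the call, so clearing the bit for each argument register (and all of its
// sub-registers) marks it as clobbered by this call.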
for (auto const &RegPair : RegsToPass)
for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
// Create the RegMask Operand according to our updated mask.
Ops.push_back(DAG.getRegisterMask(RegMask));
} else {
// Create the RegMask Operand according to the static mask.
Ops.push_back(DAG.getRegisterMask(Mask));
}
if (InFlag.getNode())
Ops.push_back(InFlag);
if (isTailCall) {
// We used to do:
//// If this is the first return lowered for this function, add the regs
//// to the liveout set for the function.
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
}
if (HasNoCfCheck && IsCFProtectionSupported) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Save heapallocsite metadata.
if (CLI.CS)
if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPop = 4;
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
// No need to reset the stack after the call if the call doesn't return. To
// make the MI verifier happy, we'll pretend the callee does it for us.
NumBytesForCalleeToPop = NumBytes;
}
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
}
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
InVals, RegMask);
}
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
// Like the stdcall convention, the callee cleans up the arguments, except that
// ECX is reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization is
// performed provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// On X86_64 architecture with GOT-style position independent code only local
// (within module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
// for example.)
// If a tail-called callee has more arguments than the caller, the caller needs
// to make sure that there is room to move the RETADDR to. This is achieved by
// reserving an area the size of the argument delta right after the original
// RETADDR, but before the saved frame pointer or the spilled registers,
// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
// stack layout:
// arg1
// arg2
// RETADDR
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// Align the stack size, e.g. to 16n + 12 bytes for a 16-byte alignment
/// requirement (leaving room for the return address slot).
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
unsigned SlotSize = RegInfo->getSlotSize();
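// Illustration: with StackAlignment = 16 and SlotSize = 4, a StackSize of 20
// becomes 28 and a StackSize of 30 becomes 44 -- both congruent to 12 (mod 16),
// so the pushed return address restores 16-byte alignment.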
if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
// The offset already fits below the target remainder (StackAlignment -
// SlotSize, e.g. 12 bytes on 32-bit); just add the difference.
Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
} else {
// Mask out the lower bits, then add one full stack alignment plus the
// (StackAlignment - SlotSize) remainder.
Offset = ((~AlignMask) & Offset) + StackAlignment +
(StackAlignment-SlotSize);
}
return Offset;
}
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
const X86InstrInfo *TII, const CCValAssign &VA) {
unsigned Bytes = Arg.getValueSizeInBits() / 8;
for (;;) {
// Look through nodes that don't alter the bits of the incoming value.
unsigned Op = Arg.getOpcode();
if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
Arg = Arg.getOperand(0);
continue;
}
if (Op == ISD::TRUNCATE) {
const SDValue &TruncInput = Arg.getOperand(0);
if (TruncInput.getOpcode() == ISD::AssertZext &&
cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
Arg.getValueType()) {
Arg = TruncInput.getOperand(0);
continue;
}
}
break;
}
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!TargetRegisterInfo::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
return false;
if (!Flags.isByVal()) {
if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
unsigned Opcode = Def->getOpcode();
if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
} else
return false;
}
} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
if (Flags.isByVal())
// ByVal argument is passed in as a pointer but it's now being
// dereferenced. e.g.
// define @foo(%struct.X* %A) {
// tail call @bar(%struct.X* byval %A)
// }
return false;
SDValue Ptr = Ld->getBasePtr();
FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
if (!FINode)
return false;
FI = FINode->getIndex();
} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
FI = FINode->getIndex();
Bytes = Flags.getByValSize();
} else
return false;
assert(FI != INT_MAX);
if (!MFI.isFixedObjectIndex(FI))
return false;
if (Offset != MFI.getObjectOffset(FI))
return false;
// If this is not byval, check that the argument stack object is immutable.
// inalloca and argument copy elision can create mutable argument stack
// objects. Byval objects can be mutated, but a byval call intends to pass the
// mutated memory.
if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
return false;
if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
Flags.isSExt() != MFI.isObjectSExt(FI)) {
return false;
}
}
return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
// perform a tailcall optimization here.
if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
return false;
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
if (IsCalleeWin64 != IsCallerWin64)
return false;
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
}
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
}
// If the call result is in ST0 / ST1, it needs to be popped off the x87
// stack. Therefore, if it's not used by the call it is not safe to optimize
// this into a sibcall.
bool Unused = false;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (!Ins[i].Used) {
Unused = true;
break;
}
}
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
}
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
RetCC_X86, RetCC_X86))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
unsigned StackArgsSize = 0;
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
MFI, MRI, TII, VA))
return false;
}
}
}
bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
unsigned Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
if (++NumInRegs == MaxInRegs)
return false;
break;
}
}
}
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
}
bool CalleeWillPop =
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
if (unsigned BytesToPop =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
return false;
} else if (CalleeWillPop && StackArgsSize > 0) {
// If we don't have bytes to pop, make sure the callee doesn't pop any.
return false;
}
return true;
}
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return X86::createFastISel(funcInfo, libInfo);
}
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
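// Return true if Op is a normal load with a single use, so it may be folded
// into its user as a memory operand.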
static bool MayFoldLoad(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}
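// Return true if Op has a single use and that use is a normal store, so Op
// could be folded into the store.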
static bool MayFoldIntoStore(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}
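// Return true if Op has a single use and that use is a zero-extend.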
static bool MayFoldIntoZeroExtend(SDValue Op) {
if (Op.hasOneUse()) {
unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
return (ISD::ZERO_EXTEND == Opcode);
}
return false;
}
static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVHLPS:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VBROADCAST:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VZEXT_MOVL:
return true;
}
}
static bool isTargetShuffleVariableMask(unsigned Opcode) {
switch (Opcode) {
default: return false;
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::VPERMILPV:
case X86ISD::VPERMIL2:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
return true;
// 'Faux' Target Shuffles.
case ISD::OR:
case ISD::AND:
case X86ISD::ANDNP:
return true;
}
}
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
-(int64_t)SlotSize,
false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement) {
// Offset should fit into 32 bit immediate field.
if (!isInt<32>(Offset))
return false;
// If we don't have a symbolic displacement - we don't have any extra
// restrictions.
if (!hasSymbolicDisplacement)
return true;
// FIXME: Some tweaks might be needed for medium code model.
if (M != CodeModel::Small && M != CodeModel::Kernel)
return false;
// For the small code model we assume that the latest object is 16MB before
// the end of the 31-bit boundary. We may also accept pretty large negative
// constants, knowing that all objects are in the positive half of the address
// space.
if (M == CodeModel::Small && Offset < 16*1024*1024)
return true;
// For the kernel code model we know that all objects reside in the negative
// half of the 32-bit address space. We may not accept negative offsets, since
// they may be just off, but we may accept pretty large positive ones.
if (M == CodeModel::Kernel && Offset >= 0)
return true;
return false;
}
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
// If GuaranteeTCO is true, we force some calls to be callee pop so that we
// can guarantee TCO.
if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
return true;
switch (CallingConv) {
default:
return false;
case CallingConv::X86_StdCall:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::X86_VectorCall:
return !is64Bit;
}
}
/// Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
case X86::COND_E:
case X86::COND_NE:
case X86::COND_B:
case X86::COND_A:
case X86::COND_BE:
case X86::COND_AE:
return true;
case X86::COND_G:
case X86::COND_GE:
case X86::COND_L:
case X86::COND_LE:
return false;
}
}
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
switch (SetCCOpcode) {
default: llvm_unreachable("Invalid integer condition!");
case ISD::SETEQ: return X86::COND_E;
case ISD::SETGT: return X86::COND_G;
case ISD::SETGE: return X86::COND_GE;
case ISD::SETLT: return X86::COND_L;
case ISD::SETLE: return X86::COND_LE;
case ISD::SETNE: return X86::COND_NE;
case ISD::SETULT: return X86::COND_B;
case ISD::SETUGT: return X86::COND_A;
case ISD::SETULE: return X86::COND_BE;
case ISD::SETUGE: return X86::COND_AE;
}
}
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
bool isFP, SDValue &LHS, SDValue &RHS,
SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
}
if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
}
}
return TranslateIntegerX86CC(SetCCOpcode);
}
// First determine if it is required or is profitable to flip the operands.
// If LHS is a foldable load, but RHS is not, flip the condition.
if (ISD::isNON_EXTLoad(LHS.getNode()) &&
!ISD::isNON_EXTLoad(RHS.getNode())) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
std::swap(LHS, RHS);
}
switch (SetCCOpcode) {
default: break;
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETUGT:
case ISD::SETUGE:
std::swap(LHS, RHS);
break;
}
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
switch (SetCCOpcode) {
default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETUEQ:
case ISD::SETEQ: return X86::COND_E;
case ISD::SETOLT: // flipped
case ISD::SETOGT:
case ISD::SETGT: return X86::COND_A;
case ISD::SETOLE: // flipped
case ISD::SETOGE:
case ISD::SETGE: return X86::COND_AE;
case ISD::SETUGT: // flipped
case ISD::SETULT:
case ISD::SETLT: return X86::COND_B;
case ISD::SETUGE: // flipped
case ISD::SETULE:
case ISD::SETLE: return X86::COND_BE;
case ISD::SETONE:
case ISD::SETNE: return X86::COND_NE;
case ISD::SETUO: return X86::COND_P;
case ISD::SETO: return X86::COND_NP;
case ISD::SETOEQ:
case ISD::SETUNE: return X86::COND_INVALID;
}
}
/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
switch (X86CC) {
default:
return false;
case X86::COND_B:
case X86::COND_BE:
case X86::COND_E:
case X86::COND_P:
case X86::COND_A:
case X86::COND_AE:
case X86::COND_NE:
case X86::COND_NP:
return true;
}
}
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
if (!IntrData)
return false;
Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
switch (IntrData->Type) {
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = I.getArgOperand(0);
MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
ScalarVT = MVT::i8;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
ScalarVT = MVT::i16;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
Info.align = 1;
Info.flags |= MachineMemOperand::MOStore;
break;
}
case GATHER:
case GATHER_AVX2: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = 1;
Info.flags |= MachineMemOperand::MOLoad;
break;
}
case SCATTER: {
Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = 1;
Info.flags |= MachineMemOperand::MOStore;
break;
}
default:
return false;
}
return true;
}
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
}
return false;
}
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
// If this is (1) an AVX vector load with (2) multiple uses and (3) all of
// those uses are extracted directly into a store, then the extract + store
// can be store-folded. Therefore, it's probably not worth splitting the load.
EVT VT = Load->getValueType(0);
if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
// Skip uses of the chain value. Result 0 of the node is the load value.
if (UI.getUse().getResNo() != 0)
continue;
// If this use is not an extract + store, it's probably worth splitting.
if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
UI->use_begin()->getOpcode() != ISD::STORE)
return true;
}
// All non-chain uses are extract + store.
return false;
}
return true;
}
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0 || BitSize > 64)
return false;
return true;
}
bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
// If we are using XMM registers in the ABI and the condition of the select is
// a floating-point compare and we have blendv or conditional move, then it is
// cheaper to select instead of doing a cross-register move and creating a
// load that depends on the compare result.
return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
}
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
// TODO: It might be a win to ease or lift this restriction, but the generic
// folds in DAGCombiner conflict with vector folds for an AVX512 target.
if (VT.isVector() && Subtarget.hasAVX512())
return false;
return true;
}
bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
// TODO: We handle scalars using custom code, but generic combining could make
// that unnecessary.
APInt MulC;
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
return false;
// If vector multiply is legal, assume that's faster than shl + add/sub.
// TODO: Multiply is a complex op with higher latency and lower throughput in
// most implementations, so this check could be loosened based on type
// and/or a CPU attribute.
if (isOperationLegal(ISD::MUL, VT))
return false;
// shl+add, shl+sub, shl+add+neg
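// A rough map of the cases covered by the checks below:
//   MulC == 2^N + 1    ->  (x << N) + x       (shl + add)
//   MulC == 2^N - 1    ->  (x << N) - x       (shl + sub)
//   MulC == 1 - 2^N    ->  x - (x << N)       (shl + sub)
//   MulC == -(2^N + 1) ->  -((x << N) + x)    (shl + add + neg)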
return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
bool IsSigned) const {
// f80 FP_TO_UINT is more efficient using the strict code path if FCMOV is available.
return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
}
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
// Mask vectors support all subregister combinations and operations that
// extract half of a vector.
if (ResVT.getVectorElementType() == MVT::i1)
return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
(Index == ResVT.getVectorNumElements()));
return (Index % ResVT.getVectorNumElements()) == 0;
}
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
unsigned Opc = VecOp.getOpcode();
// Assume target opcodes can't be scalarized.
// TODO - do we have any exceptions?
if (Opc >= ISD::BUILTIN_OP_END)
return false;
// If the vector op is not supported, try to convert to scalar.
EVT VecVT = VecOp.getValueType();
if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
return true;
// If the vector op is supported, but the scalar op is not, the transform may
// not be worthwhile.
EVT ScalarVT = VecVT.getScalarType();
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
// TODO: Allow vectors?
if (VT.isVector())
return false;
return VT.isSimple() || !isOperationExpand(Opcode, VT);
}
bool X86TargetLowering::isCheapToSpeculateCttz() const {
// Speculate cttz only if we can directly use TZCNT.
return Subtarget.hasBMI();
}
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Speculate ctlz only if we can directly use LZCNT.
return Subtarget.hasLZCNT();
}
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
const SelectionDAG &DAG,
const MachineMemOperand &MMO) const {
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
// If both types are legal vectors, it's always ok to convert them.
if (LoadVT.isVector() && BitcastVT.isVector() &&
isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
return true;
return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const {
// Do not merge up to a float value size (128 bits) if the NoImplicitFloat
// attribute is set.
bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (NoFloat) {
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
return (MemVT.getSizeInBits() <= MaxIntSize);
}
// Make sure we don't merge greater than our preferred vector
// width.
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return false;
return true;
}
bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
return true;
}
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
EVT VT = Y.getValueType();
if (VT.isVector())
return false;
if (!Subtarget.hasBMI())
return false;
// There are only 32-bit and 64-bit forms for 'andn'.
if (VT != MVT::i32 && VT != MVT::i64)
return false;
return !isa<ConstantSDNode>(Y);
}
bool X86TargetLowering::hasAndNot(SDValue Y) const {
EVT VT = Y.getValueType();
if (!VT.isVector())
return hasAndNotCompare(Y);
// Vector.
if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
return false;
if (VT == MVT::v4i32)
return true;
return Subtarget.hasSSE2();
}
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) ||
(N->getOpcode() == ISD::SRL &&
N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask");
EVT VT = N->getValueType(0);
if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
// Only fold if the shift values are equal - so it folds to AND.
// TODO - we should fold if either is a non-uniform vector but we don't do
// the fold for non-splats yet.
return N->getOperand(1) == N->getOperand(0).getOperand(1);
}
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
EVT VT = Y.getValueType();
// For vectors, we don't have a preference, but we probably want a mask.
if (VT.isVector())
return false;
// 64-bit shifts on 32-bit targets produce really bad bloated code.
if (VT == MVT::i64 && !Subtarget.is64Bit())
return false;
return true;
}
+bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
+ SDNode *N) const {
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ !Subtarget.isOSWindows())
+ return false;
+ return true;
+}
+
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
// Any legal vector type can be splatted more efficiently than
// loading/spilling from memory.
return isTypeLegal(VT);
}
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
MVT VT = MVT::getIntegerVT(NumBits);
if (isTypeLegal(VT))
return VT;
// PMOVMSKB can handle this.
if (NumBits == 128 && isTypeLegal(MVT::v16i8))
return MVT::v16i8;
// VPMOVMSKB can handle this.
if (NumBits == 256 && isTypeLegal(MVT::v32i8))
return MVT::v32i8;
// TODO: Allow 64-bit type for 32-bit target.
// TODO: 512-bit types should be allowed, but make sure that those
// cases are handled in combineVectorSizedSetCCEquality().
return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}
/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
/// Return true if every element in Mask, beginning from position Pos and ending
/// in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
if (Mask[i] != SM_SentinelUndef)
return false;
return true;
}
/// Return true if the mask creates a vector whose lower half is undefined.
static bool isUndefLowerHalf(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
return isUndefInRange(Mask, 0, NumElts / 2);
}
/// Return true if the mask creates a vector whose upper half is undefined.
static bool isUndefUpperHalf(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
}
/// Return true if Val falls within the half-open range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
return (Val >= Low && Val < Hi);
}
/// Return true if the value of any element in Mask falls within the
/// half-open range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
for (int M : Mask)
if (isInRange(M, Low, Hi))
return true;
return false;
}
/// Return true if Val is undef or if its value falls within the
/// half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
int Low, int Hi) {
for (int M : Mask)
if (!isUndefOrInRange(M, Low, Hi))
return false;
return true;
}
/// Return true if Val is undef, zero or if its value falls within the
/// half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
for (int M : Mask)
if (!isUndefOrZeroOrInRange(M, Low, Hi))
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos + Size, falls within the specified
/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
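/// For example (illustrative): Mask = <4, -1, 6, 7> with Pos = 0, Size = 4 and
/// Low = 4 matches, since the single undef element is ignored.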
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low, int Step = 1) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low + Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
if (!isUndefOrZero(Mask[i]))
return false;
return true;
}
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
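/// For example (illustrative): the mask <0, 1, 4, 5, -1, -1, 6, 7> widens to
/// <0, 2, -1, 3>, whereas a pair such as <0, 2> cannot be widened because the
/// two elements are not adjacent.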
static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
int M0 = Mask[i];
int M1 = Mask[i + 1];
// If both elements are undef, it's trivial.
if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
WidenedMask[i / 2] = M1 / 2;
continue;
}
if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
WidenedMask[i / 2] = M0 / 2;
continue;
}
// When zeroing, we need to spread the zeroing across both lanes to widen.
if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
(M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
continue;
}
return false;
}
// Finally check if the two mask values are adjacent and aligned with
// a pair.
if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
WidenedMask[i / 2] = M0 / 2;
continue;
}
// Otherwise we can't safely widen the elements used in this shuffle.
return false;
}
assert(WidenedMask.size() == Mask.size() / 2 &&
"Incorrect size of mask after widening the elements!");
return true;
}
static bool canWidenShuffleElements(ArrayRef<int> Mask,
const APInt &Zeroable,
SmallVectorImpl<int> &WidenedMask) {
SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
if (TargetMask[i] == SM_SentinelUndef)
continue;
if (Zeroable[i])
TargetMask[i] = SM_SentinelZero;
}
return canWidenShuffleElements(TargetMask, WidenedMask);
}
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
SmallVector<int, 32> WidenedMask;
return canWidenShuffleElements(Mask, WidenedMask);
}
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
}
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.
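// For example (illustrative): without native i64, the v2i64 mask constant
// <3, 1> is emitted as the v4i32 build_vector <3, 0, 1, 0> and bitcast back,
// relying on x86's little-endian element layout.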
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
const SDLoc &dl, bool IsMask = false) {
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0; i < NumElts; ++i) {
bool IsUndef = Values[i] < 0 && IsMask;
SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(Values[i], dl, EltVT);
Ops.push_back(OpNode);
if (Split)
Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(0, dl, EltVT));
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
if (Split)
ConstsNode = DAG.getBitcast(VT, ConstsNode);
return ConstsNode;
}
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert(Bits.size() == Undefs.getBitWidth() &&
"Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
if (Undefs[i]) {
Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
continue;
}
const APInt &V = Bits[i];
assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
if (Split) {
Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
} else if (EltVT == MVT::f32) {
APFloat FV(APFloat::IEEEsingle(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else if (EltVT == MVT::f64) {
APFloat FV(APFloat::IEEEdouble(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else {
Ops.push_back(DAG.getConstant(V, dl, EltVT));
}
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
return DAG.getBitcast(VT, ConstsNode);
}
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
VT.getVectorElementType() == MVT::i1) &&
"Unexpected vector type");
// Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
// type. This ensures they get CSE'd. But if the integer type is not
// available, use a floating-point +0.0 instead.
SDValue Vec;
if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
Vec = DAG.getConstant(0, dl, VT);
} else {
unsigned Num32BitElts = VT.getSizeInBits() / 32;
Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
}
return DAG.getBitcast(VT, Vec);
}
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
const SDLoc &dl, unsigned vectorWidth) {
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
unsigned Factor = VT.getSizeInBits()/vectorWidth;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2, we just need to clear the
// low bits.
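// For example (illustrative): extracting a 128-bit chunk from a v8i32 with
// IdxVal = 5 rounds the index down to 4, the first element of the second
// 128-bit chunk.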
IdxVal &= ~(ElemsPerChunk - 1);
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(ResultVT, dl,
Vec->ops().slice(IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert((Vec.getValueType().is256BitVector() ||
Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl,
unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
// Inserting UNDEF just yields Result.
if (Vec.isUndef())
return Result;
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
EVT ResultVT = Result.getValueType();
// Insert the relevant vectorWidth bits.
unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2, we just need to clear the
// low bits.
IdxVal &= ~(ElemsPerChunk - 1);
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type");
SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
: DAG.getUNDEF(VT);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
DAG.getIntPtrConstant(0, dl));
}
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl, unsigned WideSizeInBits) {
assert(Vec.getValueSizeInBits() < WideSizeInBits &&
(WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
"Unsupported vector widening type");
unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
MVT SVT = Vec.getSimpleValueType().getScalarType();
MVT VT = MVT::getVectorVT(SVT, WideNumElts);
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
assert(Ops.empty() && "Expected an empty ops vector");
if (N->getOpcode() == ISD::CONCAT_VECTORS) {
Ops.append(N->op_begin(), N->op_end());
return true;
}
if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(N->getOperand(2))) {
SDValue Src = N->getOperand(0);
SDValue Sub = N->getOperand(1);
const APInt &Idx = N->getConstantOperandAPInt(2);
EVT VT = Src.getValueType();
EVT SubVT = Sub.getValueType();
// TODO - Handle more general insert_subvector chains.
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
Idx == (VT.getVectorNumElements() / 2) &&
Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(1).getValueType() == SubVT &&
isNullConstant(Src.getOperand(2))) {
Ops.push_back(Src.getOperand(1));
Ops.push_back(Sub);
return true;
}
}
return false;
}
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops)
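//
// A typical use, sketched after callers later in this file (ResVT, LHS and RHS
// stand in for the caller's result type and operands):
//   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                            ArrayRef<SDValue> Ops) {
//     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
//     return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, ResVT, {LHS, RHS},
//                                  PMADDWDBuilder);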
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
F Builder, bool CheckBWI = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
if ((CheckBWI && Subtarget.useBWIRegs()) ||
(!CheckBWI && Subtarget.useAVX512Regs())) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
}
} else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256) {
NumSubs = VT.getSizeInBits() / 256;
assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
}
} else {
if (VT.getSizeInBits() > 128) {
NumSubs = VT.getSizeInBits() / 128;
assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
}
}
if (NumSubs == 1)
return Builder(DAG, DL, Ops);
SmallVector<SDValue, 4> Subs;
for (unsigned i = 0; i != NumSubs; ++i) {
SmallVector<SDValue, 2> SubOps;
for (SDValue Op : Ops) {
EVT OpVT = Op.getValueType();
unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
}
Subs.push_back(Builder(DAG, DL, SubOps));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
if (!isa<ConstantSDNode>(Idx))
return SDValue();
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
MVT WideOpVT = OpVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
// if necessary.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
SDValue Undef = DAG.getUNDEF(WideOpVT);
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
assert(IdxVal != 0 && "Unexpected index");
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(ShiftLeft, dl, MVT::i8));
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Simple case: the subvector goes into the upper part.
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case: use the legal zero-extending insert_subvector. This allows
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
}
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Inserting into the middle is more complicated.
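// The code below computes Vec ^ (((Vec >> IdxVal) ^ SubVec) << IdxVal), where
// the kshift pair masks the xor'd term to SubVec's width: bits
// [IdxVal, IdxVal + SubVecNumElems) become SubVec and every other bit of Vec
// is preserved.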
NumElems = WideOpVT.getVectorNumElements();
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
// Move the current value of the bits to be replaced to the lsbs.
Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Xor with the new bit.
Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
// Shift to MSB, filling bottom bits with 0.
unsigned ShiftLeft = NumElems - SubVecNumElems;
Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
DAG.getConstant(ShiftLeft, dl, MVT::i8));
// Shift to the final position, filling upper bits with 0.
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
DAG.getConstant(ShiftRight, dl, MVT::i8));
// Xor with original vector leaving the new value.
Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
const SDLoc &dl, unsigned VectorWidth) {
SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
}
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
switch (Opcode) {
case ISD::ANY_EXTEND:
case ISD::ANY_EXTEND_VECTOR_INREG:
return ISD::ANY_EXTEND_VECTOR_INREG;
case ISD::ZERO_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
return ISD::ZERO_EXTEND_VECTOR_INREG;
case ISD::SIGN_EXTEND:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return ISD::SIGN_EXTEND_VECTOR_INREG;
}
llvm_unreachable("Unknown opcode");
}
static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
ISD::ZERO_EXTEND == Opcode) &&
"Unknown extension opcode");
// For 256-bit vectors, we only need the lower (128-bit) input half.
// For 512-bit vectors, we only need the lower input half or quarter.
if (InVT.getSizeInBits() > 128) {
assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
"Expected VTs to be the same size!");
unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
In = extractSubVector(In, 0, DAG, DL,
std::max(128U, VT.getSizeInBits() / Scale));
InVT = In.getValueType();
}
if (VT.getVectorNumElements() != InVT.getVectorNumElements())
Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, In);
}
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Return a vector_shuffle of the specified vector and a zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
bool IsZero,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
int NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec(NumElems);
for (int i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
MaskVec[i] = (i == Idx) ? NumElems : i;
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
if (!Load || !ISD::isNormalLoad(Load))
return nullptr;
SDValue Ptr = Load->getBasePtr();
if (Ptr->getOpcode() == X86ISD::Wrapper ||
Ptr->getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr->getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
return nullptr;
return CNode->getConstVal();
}
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
}
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
assert(LD && "Unexpected null LoadSDNode");
return getTargetConstantFromNode(LD);
}
// Extract raw constant bits from constant pools.
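// For example (illustrative): splitting the v4i32 build_vector <1, undef, 3, 4>
// into 64-bit elements yields EltBits = { 0x1, 0x0000000400000003 }, with the
// partially-undef first element's undef half treated as zero (when partial
// undefs are allowed).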
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
SmallVectorImpl<APInt> &EltBits,
bool AllowWholeUndefs = true,
bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
// Bitcast a source array of element bits to the target size.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
"Constant bit sizes don't match");
// Don't split if we don't allow undef bits.
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (UndefSrcElts.getBoolValue() && !AllowUndefs)
return false;
// If we're already the right size, don't bother bitcasting.
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
return true;
}
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
}
// Split the undef/constant single bitset data into the target elements.
UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
// Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
if (!AllowWholeUndefs)
return false;
UndefElts.setBit(i);
continue;
}
// If only some bits are UNDEF then treat them as zero (or bail if not
// supported).
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
}
return true;
};
// Collect constant bits and insert into mask/undef bit masks.
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
unsigned UndefBitIndex) {
if (!Cst)
return false;
if (isa<UndefValue>(Cst)) {
Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
};
// Handle UNDEFs.
if (Op.isUndef()) {
APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract scalar constant bits.
if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SmallVector<APInt, 64> SrcEltBits(1, RawBits);
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantSDNode>(Src);
SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantFPSDNode>(Src);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
return false;
unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0; i != NumSrcElts; ++i)
if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
UndefSrcElts, i))
return false;
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
// Extract constant bits from a subvector broadcast.
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
SmallVector<APInt, 16> SubEltBits;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, SubEltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
UndefElts = APInt::getSplat(NumElts, UndefElts);
while (EltBits.size() < NumElts)
EltBits.append(SubEltBits.begin(), SubEltBits.end());
return true;
}
}
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits;
auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Insert constant bits from base and subvector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(2))) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
APInt UndefSubElts;
SmallVector<APInt, 32> EltSubBits;
if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefSubElts, EltSubBits,
AllowWholeUndefs, AllowPartialUndefs) &&
getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
unsigned BaseIdx = Op.getConstantOperandVal(2);
UndefElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
EltBits[BaseIdx + i] = EltSubBits[i];
return true;
}
}
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(1))) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
EVT SrcVT = Op.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = VT.getVectorNumElements();
unsigned BaseIdx = Op.getConstantOperandVal(1);
UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
if ((BaseIdx + NumSubElts) != NumSrcElts)
EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
if (BaseIdx != 0)
EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
return true;
}
}
// Extract constant bits from shuffle node sources.
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
// TODO - support shuffle through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
ArrayRef<int> Mask = SVN->getMask();
if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
llvm::any_of(Mask, [](int M) { return M < 0; }))
return false;
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if (isAnyInRange(Mask, 0, NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts0, EltBits0, AllowWholeUndefs,
AllowPartialUndefs))
return false;
if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefElts1, EltBits1, AllowWholeUndefs,
AllowPartialUndefs))
return false;
UndefElts = APInt::getNullValue(NumElts);
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
if (M < 0) {
UndefElts.setBit(i);
EltBits.push_back(APInt::getNullValue(EltSizeInBits));
} else if (M < (int)NumElts) {
if (UndefElts0[M])
UndefElts.setBit(i);
EltBits.push_back(EltBits0[M]);
} else {
if (UndefElts1[M - NumElts])
UndefElts.setBit(i);
EltBits.push_back(EltBits1[M - NumElts]);
}
}
return true;
}
return false;
}
static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
UndefElts, EltBits, true, false)) {
int SplatIndex = -1;
for (int i = 0, e = EltBits.size(); i != e; ++i) {
if (UndefElts[i])
continue;
if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
SplatIndex = -1;
break;
}
SplatIndex = i;
}
if (0 <= SplatIndex) {
SplatVal = EltBits[SplatIndex];
return true;
}
}
return false;
}
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask,
APInt &UndefElts) {
// Extract the raw target constant bits.
SmallVector<APInt, 64> EltBits;
if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
EltBits, /* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false))
return false;
// Insert the extracted elements into the mask.
for (APInt Elt : EltBits)
RawMask.push_back(Elt.getZExtValue());
return true;
}
/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
/// Note: This ignores saturation, so inputs must be checked first.
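/// For example (illustrative): a binary v16i8 pack uses the mask
/// <0, 2, 4, ..., 14, 16, 18, ..., 30>, i.e. the even-index elements of each
/// input, taken per 128-bit lane.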
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
bool Unary) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
unsigned Offset = Unary ? 0 : NumElts;
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
Mask.push_back(Elt + (Lane * NumEltsPerLane));
for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
}
}
// Split the demanded elts of a PACKSS/PACKUS node between its operands.
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
APInt &DemandedLHS, APInt &DemandedRHS) {
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = DemandedElts.getBitWidth();
int NumInnerElts = NumElts / 2;
int NumEltsPerLane = NumElts / NumLanes;
int NumInnerEltsPerLane = NumInnerElts / NumLanes;
DemandedLHS = APInt::getNullValue(NumInnerElts);
DemandedRHS = APInt::getNullValue(NumInnerElts);
// Map DemandedElts to the packed operands.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
int OuterIdx = (Lane * NumEltsPerLane) + Elt;
int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
if (DemandedElts[OuterIdx])
DemandedLHS.setBit(InnerIdx);
if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
DemandedRHS.setBit(InnerIdx);
}
}
}
// Split the demanded elts of a HADD/HSUB node between its operands.
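// For example (illustrative): demanding element 0 of a v8i32 HADD demands only
// elements 0 and 1 of the first (LHS) operand.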
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
APInt &DemandedLHS, APInt &DemandedRHS) {
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = DemandedElts.getBitWidth();
int NumEltsPerLane = NumElts / NumLanes;
int HalfEltsPerLane = NumEltsPerLane / 2;
DemandedLHS = APInt::getNullValue(NumElts);
DemandedRHS = APInt::getNullValue(NumElts);
// Map DemandedElts to the horizontal operands.
for (int Idx = 0; Idx != NumElts; ++Idx) {
if (!DemandedElts[Idx])
continue;
int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
int LocalIdx = Idx % NumEltsPerLane;
if (LocalIdx < HalfEltsPerLane) {
DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
} else {
LocalIdx -= HalfEltsPerLane;
DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
}
}
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
SDValue ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
IsUnary = false;
bool IsFakeUnary = false;
switch (N->getOpcode()) {
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeSHUFPMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(1)) &&
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = true;
}
break;
case X86ISD::INSERTQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(2)) &&
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
}
break;
case X86ISD::UNPCKH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVHLPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVLHPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
break;
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeZeroMoveLowMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
SDValue N0 = N->getOperand(0);
// See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
// add the pre-extracted value to the Ops vector.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getOperand(0).getValueType() == VT &&
N0.getConstantOperandVal(1) == 0)
Ops.push_back(N0.getOperand(0));
// We only decode broadcasts of same-sized vectors, unless the broadcast
// came from an extract of the original-width vector. If we found one, we
// pushed it onto the Ops vector above.
if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(NumElems, Mask);
IsUnary = true;
break;
}
return false;
}
case X86ISD::VPERMILPV: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::PSHUFB: {
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodePSHUFBMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSLDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSHDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVDDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VPERMIL2: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
SDValue CtrlNode = N->getOperand(3);
if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
unsigned CtrlImm = CtrlOp->getZExtValue();
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
Mask);
break;
}
}
return false;
}
case X86ISD::VPPERM: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodeVPPERMMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV: {
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMVMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV3: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
Ops.push_back(N->getOperand(0));
Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
default: llvm_unreachable("unknown target shuffle node");
}
// Empty mask indicates the decode failed.
if (Mask.empty())
return false;
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero)
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
// into the first input.
if (IsFakeUnary)
for (int &M : Mask)
if (M >= (int)Mask.size())
M -= Mask.size();
// If we didn't already add operands in the opcode-specific code, default to
// adding 1 or 2 operands starting at 0.
if (Ops.empty()) {
Ops.push_back(N->getOperand(0));
if (!IsUnary || IsFakeUnary)
Ops.push_back(N->getOperand(1));
}
return true;
}
/// Check a target shuffle mask's inputs to see if we can set any values to
/// SM_SentinelZero - this is for elements that are known to be zero
/// (not just zeroable) from their inputs.
/// Returns true if the target shuffle mask was decoded.
static bool setTargetShuffleZeroElements(SDValue N,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops) {
bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
MVT VT = N.getSimpleValueType();
if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
SDValue V1 = Ops[0];
SDValue V2 = IsUnary ? V1 : Ops[1];
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
assert((VT.getSizeInBits() % Mask.size()) == 0 &&
"Illegal split of shuffle value type");
unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
// Extract known constant input data.
APInt UndefSrcElts[2];
SmallVector<APInt, 32> SrcEltBits[2];
bool IsSrcConstant[2] = {
getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
SrcEltBits[0], true, false),
getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
SrcEltBits[1], true, false)};
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
// Already decoded as SM_SentinelZero / SM_SentinelUndef.
if (M < 0)
continue;
// Determine shuffle input and normalize the mask.
unsigned SrcIdx = M / Size;
SDValue V = M < Size ? V1 : V2;
M %= Size;
// We are referencing an UNDEF input.
if (V.isUndef()) {
Mask[i] = SM_SentinelUndef;
continue;
}
// SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
// TODO: We currently only set UNDEF for integer types - floats use the same
// registers as vectors and many of the scalar folded loads rely on the
// SCALAR_TO_VECTOR pattern.
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Size % V.getValueType().getVectorNumElements()) == 0) {
int Scale = Size / V.getValueType().getVectorNumElements();
int Idx = M / Scale;
if (Idx != 0 && !VT.isFloatingPoint())
Mask[i] = SM_SentinelUndef;
else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
Mask[i] = SM_SentinelZero;
continue;
}
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
Mask[i] = SM_SentinelUndef;
else if (SrcEltBits[SrcIdx][M] == 0)
Mask[i] = SM_SentinelZero;
}
}
assert(VT.getVectorNumElements() == Mask.size() &&
"Different mask size from vector size!");
return true;
}
// Forward declaration (for getFauxShuffleMask recursive check).
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
SelectionDAG &DAG);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than the
// destination value type.
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case ISD::VECTOR_SHUFFLE: {
// Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
return false;
}
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
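// For example (illustrative): AND(X, <0xFF, 0x00, 0xFF, ...>) becomes a
// shuffle that keeps each byte of X where the mask byte is 0xFF and zeroes
// each byte where it is 0x00.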
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
bool IsAndN = (X86ISD::ANDNP == Opcode);
uint64_t ZeroMask = IsAndN ? 255 : 0;
if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
Mask.push_back(SM_SentinelUndef);
continue;
}
uint64_t ByteBits = EltBits[i].getZExtValue();
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
}
Ops.push_back(IsAndN ? N1 : N0);
return true;
}
case ISD::OR: {
// Inspect each operand at the byte level. We can merge these into a
// blend shuffle mask if for each byte at least one is masked out (zero).
KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts);
KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
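// A Known.Zero byte of 0xFF means that operand's byte is known to be zero, so
// the OR result takes that byte from the other operand; if both bytes are
// known zero the result byte is zero.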
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
if (LHS == 255 && RHS == 0)
SelectMask.setBit(i);
else if (LHS == 255 && RHS == 255)
ZeroMask.setBit(i);
else if (!(LHS == 0 && RHS == 255))
IsByteMask = false;
}
if (IsByteMask) {
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
for (unsigned j = 0; j != NumBytesPerElt; ++j) {
unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
Mask.push_back(Idx);
}
}
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
}
// Handle OR(SHUFFLE,SHUFFLE) cases where, for each element, one source is zero
// and the other provides a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
!resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
return false;
int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (int i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(Mask0[i]);
else if (Mask0[i] == SM_SentinelZero)
Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size()));
else
return false;
}
for (SDValue &Op : SrcInputs0)
Ops.push_back(Op);
for (SDValue &Op : SrcInputs1)
Ops.push_back(Op);
return true;
}
case ISD::INSERT_SUBVECTOR: {
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
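// e.g. (v8i32 insert_subvector X, (v4i32 extract_subvector Y, 4), 0) yields
// the mask <12,13,14,15,4,5,6,7> over the inputs {X, Y}.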
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Sub.getOperand(0).getValueType() == VT &&
isa<ConstantSDNode>(Sub.getOperand(1))) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i)
Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
Ops.push_back(Src);
Ops.push_back(Sub.getOperand(0));
return true;
}
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG))
return false;
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
int Scale = NumSubElts / SubMask.size();
SmallVector<int,64> ScaledSubMask;
scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
int Scale = SubMask.size() / NumSubElts;
NumSubElts = SubMask.size();
NumElts *= Scale;
InsertIdx *= Scale;
}
}
Ops.push_back(Src);
for (SDValue &SubInput : SubInputs) {
EVT SubSVT = SubInput.getValueType().getScalarType();
EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
NumSizeInBits / SubSVT.getSizeInBits());
Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
DAG.getUNDEF(AltVT), SubInput,
DAG.getIntPtrConstant(0, SDLoc(N))));
}
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
int InputIdx = M / NumSubElts;
M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
}
Mask[i + InsertIdx] = M;
}
return true;
}
case ISD::SCALAR_TO_VECTOR: {
// Match against a scalar_to_vector of an extract from a vector;
// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
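// e.g. (v4i32 scalar_to_vector (PEXTRW v8i16 V, 5)) produces the v8i16-wide
// mask <5,Z,U,U,U,U,U,U>: the implicit zext zeroes the adjacent word and the
// remaining elements are undef.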
SDValue N0 = N.getOperand(0);
SDValue SrcExtract;
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.getOperand(0).getValueType() == VT) ||
(N0.getOpcode() == X86ISD::PEXTRW &&
N0.getOperand(0).getValueType() == MVT::v8i16) ||
(N0.getOpcode() == X86ISD::PEXTRB &&
N0.getOperand(0).getValueType() == MVT::v16i8)) {
SrcExtract = N0;
}
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
if (NumSrcElts <= SrcIdx)
return false;
Ops.push_back(SrcVec);
Mask.push_back(SrcIdx);
Mask.append(NumZeros, SM_SentinelZero);
Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
return true;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue InVec = N.getOperand(0);
SDValue InScl = N.getOperand(1);
SDValue InIndex = N.getOperand(2);
if (!isa<ConstantSDNode>(InIndex) ||
cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
return false;
uint64_t InIdx = N.getConstantOperandVal(2);
// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
if (X86::isZeroNode(InScl)) {
Ops.push_back(InVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
return true;
}
// Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
unsigned ExOp =
(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
if (InScl.getOpcode() != ExOp)
return false;
SDValue ExVec = InScl.getOperand(0);
SDValue ExIndex = InScl.getOperand(1);
if (!isa<ConstantSDNode>(ExIndex) ||
cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
return false;
uint64_t ExIdx = InScl.getConstantOperandVal(1);
Ops.push_back(InVec);
Ops.push_back(ExVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
return true;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
APInt EltsLHS, EltsRHS;
getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
// If we know input saturation won't happen, we can treat this
// as a truncation shuffle.
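// PACKSS avoids saturation only if each source element is already a
// sign-extension of its low NumBitsPerElt bits, i.e. it has more than
// NumBitsPerElt known sign bits.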
if (Opcode == X86ISD::PACKSS) {
if ((!N0.isUndef() &&
DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) ||
(!N1.isUndef() &&
DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) ||
(!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS)))
return false;
}
bool IsUnary = (N0 == N1);
Ops.push_back(N0);
if (!IsUnary)
Ops.push_back(N1);
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
// Out of range bit shifts are guaranteed to be zero.
if (NumBitsPerElt <= ShiftVal) {
Mask.append(NumElts, SM_SentinelZero);
return true;
}
// We can only decode 'whole byte' bit shifts as shuffles.
if ((ShiftVal % 8) != 0)
break;
uint64_t ByteShift = ShiftVal / 8;
unsigned NumBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
Mask.append(NumBytes, SM_SentinelZero);
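// e.g. a v2i64 VSHLI by 8 bits shifts each element left by one byte: on
// little-endian x86, destination byte j takes source byte j-1 and byte 0 of
// each element becomes zero.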
if (X86ISD::VSHLI == Opcode) {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
return false;
if (NumSizeInBits != SrcVT.getSizeInBits()) {
assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
"Illegal broadcast type");
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumSizeInBits / SrcVT.getScalarSizeInBits());
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
DAG.getUNDEF(SrcVT), Src,
DAG.getIntPtrConstant(0, SDLoc(N)));
}
Ops.push_back(Src);
Mask.append(NumElts, 0);
return true;
}
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::ANY_EXTEND_VECTOR_INREG: {
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
// Extended source must be a simple vector.
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
bool IsAnyExtend =
(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
Mask);
if (NumSizeInBits != SrcVT.getSizeInBits()) {
assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
"Illegal zero-extension type");
SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
NumSizeInBits / NumSrcBitsPerElt);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
DAG.getUNDEF(SrcVT), Src,
DAG.getIntPtrConstant(0, SDLoc(N)));
}
Ops.push_back(Src);
return true;
}
}
return false;
}
/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
int MaskWidth = Mask.size();
SmallVector<SDValue, 16> UsedInputs;
for (int i = 0, e = Inputs.size(); i < e; ++i) {
int lo = UsedInputs.size() * MaskWidth;
int hi = lo + MaskWidth;
// Strip UNDEF input usage.
if (Inputs[i].isUndef())
for (int &M : Mask)
if ((lo <= M) && (M < hi))
M = SM_SentinelUndef;
// Check for unused inputs.
if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
for (int &M : Mask)
if (lo <= M)
M -= MaskWidth;
continue;
}
// Check for repeated inputs.
bool IsRepeat = false;
for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
if (UsedInputs[j] != Inputs[i])
continue;
for (int &M : Mask)
if (lo <= M)
M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
IsRepeat = true;
break;
}
if (IsRepeat)
continue;
UsedInputs.push_back(Inputs[i]);
}
Inputs = UsedInputs;
}
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
SelectionDAG &DAG) {
unsigned NumElts = Op.getValueType().getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG))
return false;
resolveTargetShuffleInputsAndMask(Inputs, Mask);
return true;
}
/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
unsigned Depth) {
if (Depth == 6)
return SDValue(); // Limit search depth.
SDValue V = SDValue(N, 0);
EVT VT = V.getValueType();
unsigned Opcode = V.getOpcode();
// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
int Elt = SV->getMaskElt(Index);
if (Elt < 0)
return DAG.getUNDEF(VT.getVectorElementType());
unsigned NumElems = VT.getVectorNumElements();
SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
: SV->getOperand(1);
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
}
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
MVT ShufVT = V.getSimpleValueType();
MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt == SM_SentinelZero)
return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
: DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
if (Elt == SM_SentinelUndef)
return DAG.getUNDEF(ShufSVT);
assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
Depth+1);
}
// Recurse into insert_subvector base/sub vector to find scalars.
if (Opcode == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(N->getOperand(2))) {
SDValue Vec = N->getOperand(0);
SDValue Sub = N->getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
uint64_t SubIdx = N->getConstantOperandVal(2);
if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
}
// Recurse into extract_subvector src vector to find scalars.
if (Opcode == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(N->getOperand(1))) {
SDValue Src = N->getOperand(0);
uint64_t SrcIdx = N->getConstantOperandVal(1);
return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
}
// Actual nodes that may contain scalar elements
if (Opcode == ISD::BITCAST) {
V = V.getOperand(0);
EVT SrcVT = V.getValueType();
unsigned NumElems = VT.getVectorNumElements();
if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
return SDValue();
}
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
return (Index == 0) ? V.getOperand(0)
: DAG.getUNDEF(VT.getVectorElementType());
if (V.getOpcode() == ISD::BUILD_VECTOR)
return V.getOperand(Index);
return SDValue();
}
// Use PINSRB/PINSRW/PINSRD to create a build vector.
static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
"Illegal vector insertion");
SDLoc dl(Op);
SDValue V;
bool First = true;
for (unsigned i = 0; i < NumElts; ++i) {
bool IsNonZero = (NonZeros & (1 << i)) != 0;
if (!IsNonZero)
continue;
// If the build vector contains zeros or our first insertion is not the
// first index, insert into a zero vector to break any register
// dependency; otherwise use SCALAR_TO_VECTOR.
if (First) {
First = false;
if (NumZero || 0 != i)
V = getZeroVector(VT, Subtarget, DAG, dl);
else {
assert(0 == i && "Expected insertion into zero-index");
V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
V = DAG.getBitcast(VT, V);
continue;
}
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
DAG.getIntPtrConstant(i, dl));
}
return V;
}
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 8 && !Subtarget.hasSSE41())
return SDValue();
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41())
return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
Subtarget);
SDLoc dl(Op);
SDValue V;
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
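// Each byte pair is combined into an i32 value (low byte | high byte << 8),
// truncated to i16 and inserted into the v8i16 result at word index i/2.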
for (unsigned i = 0; i < 16; i += 2) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
if (!ThisIsNonZero && !NextIsNonZero)
continue;
// FIXME: Investigate combining the first 4 bytes as a i32 instead.
SDValue Elt;
if (ThisIsNonZero) {
if (NumZero || NextIsNonZero)
Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
else
Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
}
if (NextIsNonZero) {
SDValue NextElt = Op.getOperand(i + 1);
if (i == 0 && NumZero)
NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
else
NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
DAG.getConstant(8, dl, MVT::i8));
if (ThisIsNonZero)
Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
else
Elt = NextElt;
}
// If our first insertion is not the first index, insert into a zero
// vector to break any register dependency; otherwise use SCALAR_TO_VECTOR.
if (!V) {
if (i != 0)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
V = DAG.getBitcast(MVT::v8i16, V);
continue;
}
}
Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
DAG.getIntPtrConstant(i / 2, dl));
}
return DAG.getBitcast(MVT::v16i8, V);
}
/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 4 && !Subtarget.hasSSE41())
return SDValue();
// Use PINSRW to insert each byte directly.
return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
Subtarget);
}
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If this is a splat of a pair of elements, use MOVDDUP (unless the target
// has XOP; in that case defer lowering to potentially use VPERMIL2PS).
// Because we're creating a less complicated build vector here, we may enable
// further folding of the MOVDDUP via shuffle transforms.
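// e.g. <a,b,a,b> becomes (bitcast (MOVDDUP (bitcast <a,b,undef,undef> to
// v2f64))).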
if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
Op.getOperand(0) == Op.getOperand(2) &&
Op.getOperand(1) == Op.getOperand(3) &&
Op.getOperand(0) != Op.getOperand(1)) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
// Create a new build vector with the first 2 elements followed by undef
// padding, bitcast to v2f64, duplicate, and bitcast back.
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
return DAG.getBitcast(VT, Dup);
}
// Find all zeroable elements.
std::bitset<4> Zeroable, Undefs;
for (int i = 0; i < 4; ++i) {
SDValue Elt = Op.getOperand(i);
Undefs[i] = Elt.isUndef();
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
unsigned FirstNonZeroIdx;
for (unsigned i = 0; i < 4; ++i) {
if (Zeroable[i])
continue;
SDValue Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
// Make sure that this node is extracting from a 128-bit vector.
MVT VT = Elt.getOperand(0).getSimpleValueType();
if (!VT.is128BitVector())
return SDValue();
if (!FirstNonZero.getNode()) {
FirstNonZero = Elt;
FirstNonZeroIdx = i;
}
}
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
SDValue V1 = FirstNonZero.getOperand(0);
MVT VT = V1.getSimpleValueType();
// See if this build_vector can be lowered as a blend with zero.
SDValue Elt;
unsigned EltMaskIdx, EltIdx;
int Mask[4];
for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
if (Zeroable[EltIdx]) {
// The zero vector will be on the right hand side.
Mask[EltIdx] = EltIdx+4;
continue;
}
Elt = Op->getOperand(EltIdx);
// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
EltMaskIdx = Elt.getConstantOperandVal(1);
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
break;
Mask[EltIdx] = EltIdx;
}
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
SDValue VZeroOrUndef = (Zeroable == Undefs)
? DAG.getUNDEF(VT)
: getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
}
// See if we can lower this build_vector to an INSERTPS.
if (!Subtarget.hasSSE41())
return SDValue();
SDValue V2 = Elt.getOperand(0);
if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
V1 = SDValue();
bool CanFold = true;
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
if (Zeroable[i])
continue;
SDValue Current = Op->getOperand(i);
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
}
if (!CanFold)
return SDValue();
assert(V1.getNode() && "Expected at least two non-zero elements!");
if (V1.getSimpleValueType() != MVT::v4f32)
V1 = DAG.getBitcast(MVT::v4f32, V1);
if (V2.getSimpleValueType() != MVT::v4f32)
V2 = DAG.getBitcast(MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
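// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0].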
unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
SDLoc DL(Op);
SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getIntPtrConstant(InsertPSMask, DL));
return DAG.getBitcast(VT, Result);
}
/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
SelectionDAG &DAG, const TargetLowering &TLI,
const SDLoc &dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
SelectionDAG &DAG) {
// Check if the scalar load can be widened into a vector load, and if
// the address is "base + cst", see if the cst can be "absorbed" into
// the shuffle mask.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
SDValue Ptr = LD->getBasePtr();
if (!ISD::isNormalLoad(LD) || LD->isVolatile())
return SDValue();
EVT PVT = LD->getValueType(0);
if (PVT != MVT::i32 && PVT != MVT::f32)
return SDValue();
int FI = -1;
int64_t Offset = 0;
if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
FI = FINode->getIndex();
Offset = 0;
} else if (DAG.isBaseWithConstantOffset(Ptr) &&
isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
Offset = Ptr.getConstantOperandVal(1);
Ptr = Ptr.getOperand(0);
} else {
return SDValue();
}
// FIXME: 256-bit vector instructions don't require a strict alignment;
// improve this code to support it better.
unsigned RequiredAlign = VT.getSizeInBits()/8;
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
if (MFI.isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
// If someone *really* cares about this. That's the way to implement it.
return SDValue();
} else {
MFI.setObjectAlignment(FI, RequiredAlign);
}
}
// (Offset % 16 or 32) must be a multiple of 4. The address is then
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
if ((Offset % RequiredAlign) & 3)
return SDValue();
int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
if (StartOffset) {
SDLoc DL(Ptr);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
}
int EltNo = (Offset - StartOffset) >> 2;
unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset));
SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}
return SDValue();
}
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
APInt LoadMask = APInt::getNullValue(NumElems);
APInt ZeroMask = APInt::getNullValue(NumElems);
APInt UndefMask = APInt::getNullValue(NumElems);
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
if (Elt.isUndef()) {
UndefMask.setBit(i);
continue;
}
if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
ZeroMask.setBit(i);
continue;
}
// Each loaded element must be the correct fractional portion of the
// requested vector load.
if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
return SDValue();
if (!ISD::isNON_EXTLoad(Elt.getNode()))
return SDValue();
Loads[i] = cast<LoadSDNode>(Elt);
LoadMask.setBit(i);
LastLoadedElt = i;
}
assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
LoadMask.countPopulation()) == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
int FirstLoadedElt = LoadMask.countTrailingZeros();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
EVT EltBaseVT = EltBase.getValueType();
assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
"Register/Memory size mismatch");
LoadSDNode *LDBase = Loads[FirstLoadedElt];
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// Consecutive loads can contain UNDEFs but not ZERO elements.
// Consecutive loads with UNDEF and ZERO elements require an
// additional shuffle stage to clear the ZERO elements.
bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
i - FirstLoadedElt)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
}
} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;
}
}
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
"Cannot merge volatile loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
// Check if the base load is entirely dereferenceable.
bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
// LOAD - all consecutive load/undefs (must start/end with a load or be
// entirely dereferenceable). If we have found an entire vector of loads and
// undefs, then return a large load of the entire vector width starting at the
// base pointer. If the vector contains zeros, then attempt to shuffle those
// elements.
if (FirstLoadedElt == 0 &&
(LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
// will lower to regular temporal loads and use the cache.
if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
if (NumElems == 1)
return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && VT.isVector()) {
SmallVector<int, 4> ClearMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (ZeroMask[i])
ClearMask[i] = i + NumElems;
else if (LoadMask[i])
ClearMask[i] = i;
}
SDValue V = CreateLoad(VT, LDBase);
SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
}
// If the upper half of a ymm/zmm load is undef then just load the lower half.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned HalfNumElems = NumElems / 2;
if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
SDValue HalfLD =
EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
DAG, Subtarget, isAfterLegalize);
if (HalfLD)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
HalfLD, DAG.getIntPtrConstant(0, DL));
}
}
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
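// e.g. <(load i32 *p), zero, zero, zero> becomes a single 32-bit VZEXT_LOAD
// that zero-fills the remaining v4i32 elements.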
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
(LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
MachineMemOperand::MOLoad);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
// BROADCAST - match the smallest possible repetition pattern, load that
// scalar/subvector element and then broadcast to the entire vector.
if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
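// e.g. for <a,b,a,b,a,b,a,b> of loaded i32 elements, SubElems=2 matches: the
// repeated 64-bit pattern is loaded once and broadcast across the whole
// vector.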
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
unsigned RepeatSize = SubElems * BaseSizeInBits;
unsigned ScalarSize = std::min(RepeatSize, 64u);
if (!Subtarget.hasAVX2() && ScalarSize < 32)
continue;
bool Match = true;
SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
for (unsigned i = 0; i != NumElems && Match; ++i) {
if (!LoadMask[i])
continue;
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (RepeatedLoads[i % SubElems].isUndef())
RepeatedLoads[i % SubElems] = Elt;
else
Match &= (RepeatedLoads[i % SubElems] == Elt);
}
// We must have loads at both ends of the repetition.
Match &= !RepeatedLoads.front().isUndef();
Match &= !RepeatedLoads.back().isUndef();
if (!Match)
continue;
EVT RepeatVT =
VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
: EVT::getFloatingPointVT(ScalarSize);
if (RepeatSize > ScalarSize)
RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
RepeatSize / ScalarSize);
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
VT.getSizeInBits() / ScalarSize);
if (TLI.isTypeLegal(BroadcastVT)) {
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
: X86ISD::VBROADCAST;
SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
return DAG.getBitcast(VT, Broadcast);
}
}
}
}
return SDValue();
}
// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
SmallVector<SDValue, 64> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
Elts.push_back(Elt);
continue;
}
return SDValue();
}
assert(Elts.size() == VT.getVectorNumElements());
return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
isAfterLegalize);
}
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
unsigned NumElm = SplatBitSize / ScalarSize;
SmallVector<Constant *, 32> ConstantVec;
for (unsigned i = 0; i < NumElm; i++) {
APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
if (ScalarSize == 32) {
Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
} else {
assert(ScalarSize == 64 && "Unsupported floating point scalar size");
Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
}
} else
Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
ConstantVec.push_back(Const);
}
return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
static bool isFoldableUseOfShuffle(SDNode *N) {
for (auto *U : N->uses()) {
unsigned Opc = U->getOpcode();
// VPERMV/VPERMV3 shuffles can never fold their index operands.
if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
return false;
if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
return false;
if (isTargetShuffle(Opc))
return true;
if (Opc == ISD::BITCAST) // Ignore bitcasts
return isFoldableUseOfShuffle(U);
if (N->hasOneUse())
return true;
}
return false;
}
// Check if the current build vector node is a zero-extended splat vector.
// If so, return the value that is being zero-extended.
// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
// NumElt - returns the number of zero-extended identical values.
// EltType - returns the type of the value including the zero extension.
static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
unsigned &NumElt, MVT &EltType) {
SDValue ExtValue = Op->getOperand(0);
unsigned NumElts = Op->getNumOperands();
unsigned Delta = NumElts;
for (unsigned i = 1; i < NumElts; i++) {
if (Op->getOperand(i) == ExtValue) {
Delta = i;
break;
}
if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
return SDValue();
}
if (!isPowerOf2_32(Delta) || Delta == 1)
return SDValue();
for (unsigned i = Delta; i < NumElts; i++) {
if (i % Delta == 0) {
if (Op->getOperand(i) != ExtValue)
return SDValue();
} else if (!(isNullConstant(Op->getOperand(i)) ||
Op->getOperand(i).isUndef()))
return SDValue();
}
unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
unsigned ExtVTSize = EltSize * Delta;
EltType = MVT::getIntegerVT(ExtVTSize);
NumElt = NumElts / Delta;
return ExtValue;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
if (!Subtarget.hasAVX())
return SDValue();
MVT VT = BVOp->getSimpleValueType(0);
SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
BitVector UndefElements;
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// Attempt to use VBROADCASTM
// From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
// b. t1 = (build_vector t0 t0)
//
// Create (VBROADCASTM v2i1 X)
if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
MVT EltType = VT.getScalarType();
unsigned NumElts = VT.getVectorNumElements();
SDValue BOperand;
SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
(Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
if (ZeroExtended)
BOperand = ZeroExtended.getOperand(0);
else
BOperand = Ld.getOperand(0).getOperand(0);
MVT MaskVT = BOperand.getSimpleValueType();
if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
SDValue Brdcst =
DAG.getNode(X86ISD::VBROADCASTM, dl,
MVT::getVectorVT(EltType, NumElts), BOperand);
return DAG.getBitcast(VT, Brdcst);
}
}
}
unsigned NumElts = VT.getVectorNumElements();
unsigned NumUndefElts = UndefElements.count();
if (!Ld || (NumElts - NumUndefElts) <= 1) {
APInt SplatValue, Undef;
unsigned SplatBitSize;
bool HasUndef;
// Check if this is a repeated constant pattern suitable for broadcasting.
if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
SplatBitSize > VT.getScalarSizeInBits() &&
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with a broadcast when the build_vector is used by a
// shuffle instruction, to preserve the present custom lowering of shuffles.
if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// replace BUILD_VECTOR with broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
if (Subtarget.hasAVX()) {
if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
!(SplatBitSize == 64 && Subtarget.is32Bit())) {
// Splatted value can fit in one INTEGER constant in constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize == 32 || SplatBitSize == 64) {
// Splatted value can fit in one FLOAT constant in constant pool.
// Load the constant and broadcast it.
// AVX only has support for 32 and 64 bit broadcasts of floats.
// There is no 64-bit integer broadcast on a 32-bit subtarget.
MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
// Lower the splat via APFloat directly, to avoid any conversion.
Constant *C =
SplatBitSize == 32
? ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEsingle(), SplatValue))
: ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEdouble(), SplatValue));
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
return DAG.getBitcast(VT, Brdcst);
}
}
}
// If we are moving a scalar into a vector (Ld must be set and all elements
// but 1 are undef) and that operation is not obviously supported by
// vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
// That's better than general shuffling and may eliminate a load to GPR and
// move from scalar to vector register.
if (!Ld || NumElts - NumUndefElts != 1)
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
return SDValue();
}
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
// When optimizing for size, generate up to 5 extra bytes for a broadcast
// instruction to save 8 or more bytes of constant pool data.
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
// On Sandybridge (no AVX2), it is still better to load a constant vector
// from the constant pool and not to broadcast it from a scalar.
// But override that restriction when optimizing for size.
// TODO: Check if splatting is recommended for other AVX-capable CPUs.
if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
C = CF->getConstantFPValue();
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
}
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The scalar source must be a normal load.
if (!IsLoad)
return SDValue();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(Subtarget.hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit case so it doesn't
// match double, since there is no vbroadcastsd xmm.
if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
// Unsupported broadcast.
return SDValue();
}
/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
SDValue ExtIdx) {
int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
return Idx;
// For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
// lowered this:
// (extract_vector_elt (v8f32 %1), Constant<6>)
// to:
// (extract_vector_elt (vector_shuffle<2,u,u,u>
// (extract_subvector (v8f32 %0), Constant<4>),
// undef)
// Constant<0>)
// In this case the vector is the extract_subvector expression and the index
// is 2, as specified by the shuffle.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
SDValue ShuffleVec = SVOp->getOperand(0);
MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
assert(ShuffleVecVT.getVectorElementType() ==
ExtractedFromVec.getSimpleValueType().getVectorElementType());
int ShuffleIdx = SVOp->getMaskElt(Idx);
if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
ExtractedFromVec = ShuffleVec;
return ShuffleIdx;
}
return Idx;
}
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// Skip if insert_vec_elt is not supported.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
SDLoc DL(Op);
unsigned NumElems = Op.getNumOperands();
SDValue VecIn1;
SDValue VecIn2;
SmallVector<unsigned, 4> InsertIndices;
SmallVector<int, 8> Mask(NumElems, -1);
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Opc = Op.getOperand(i).getOpcode();
if (Opc == ISD::UNDEF)
continue;
if (Opc != ISD::EXTRACT_VECTOR_ELT) {
// Quit if more than one element needs inserting.
if (InsertIndices.size() > 1)
return SDValue();
InsertIndices.push_back(i);
continue;
}
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
// Quit if non-constant index.
if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
// Quit if extracted from vector of different type.
if (ExtractedFromVec.getValueType() != VT)
return SDValue();
if (!VecIn1.getNode())
VecIn1 = ExtractedFromVec;
else if (VecIn1 != ExtractedFromVec) {
if (!VecIn2.getNode())
VecIn2 = ExtractedFromVec;
else if (VecIn2 != ExtractedFromVec)
// Quit if more than 2 vectors to shuffle
return SDValue();
}
if (ExtractedFromVec == VecIn1)
Mask[i] = Idx;
else if (ExtractedFromVec == VecIn2)
Mask[i] = Idx + NumElems;
}
if (!VecIn1.getNode())
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
for (unsigned Idx : InsertIndices)
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
DAG.getIntPtrConstant(Idx, DL));
return NV;
}
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
Op.getScalarValueSizeInBits() == 1 &&
"Can not convert non-constant vector");
uint64_t Immediate = 0;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (!In.isUndef())
Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
}
SDLoc dl(Op);
MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
return DAG.getConstant(Immediate, dl, VT);
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
assert((VT.getVectorElementType() == MVT::i1) &&
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode()))
return Op;
if (ISD::isBuildVectorAllOnes(Op.getNode()))
return Op;
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
// Split the pieces.
SDValue Lower =
DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
SDValue Upper =
DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
// We have to manually lower both halves so getNode doesn't try to
// reassemble the build_vector.
Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
}
SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
if (Imm.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, Imm);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
DAG.getIntPtrConstant(0, dl));
}
// Vector has one or more non-const elements
uint64_t Immediate = 0;
SmallVector<unsigned, 16> NonConstIdx;
bool IsSplat = true;
bool HasConstElts = false;
int SplatIdx = -1;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.isUndef())
continue;
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
else {
Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
HasConstElts = true;
}
if (SplatIdx < 0)
SplatIdx = idx;
else if (In != Op.getOperand(SplatIdx))
IsSplat = false;
}
// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
if (IsSplat)
return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
DAG.getConstant(1, dl, VT),
DAG.getConstant(0, dl, VT));
// insert elements one by one
SDValue DstVec;
SDValue Imm;
if (Immediate) {
MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
Imm = DAG.getConstant(Immediate, dl, ImmVT);
}
else if (HasConstElts)
Imm = DAG.getConstant(0, dl, VT);
else
Imm = DAG.getUNDEF(VT);
if (Imm.getValueSizeInBits() == VT.getSizeInBits())
DstVec = DAG.getBitcast(VT, Imm);
else {
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
DAG.getIntPtrConstant(0, dl));
}
for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
unsigned InsertIdx = NonConstIdx[i];
DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(InsertIdx),
DAG.getIntPtrConstant(InsertIdx, dl));
}
return DstVec;
}
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
/// may not match the layout of an x86 256-bit horizontal instruction.
/// In other words, if this returns true, then some extraction/insertion will
/// be required to produce a valid horizontal instruction.
///
/// Parameter \p Opcode defines the kind of horizontal operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// TODO: This function was originally used to match both real and fake partial
/// horizontal operations, but the index-matching logic is incorrect for that.
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
SelectionDAG &DAG,
unsigned BaseIdx, unsigned LastIdx,
SDValue &V0, SDValue &V1) {
EVT VT = N->getValueType(0);
assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
"Invalid Vector in input!");
bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
bool CanFold = true;
unsigned ExpectedVExtractIdx = BaseIdx;
unsigned NumElts = LastIdx - BaseIdx;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// Check if N implements a horizontal binop.
for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
SDValue Op = N->getOperand(i + BaseIdx);
// Skip UNDEFs.
if (Op->isUndef()) {
// Update the expected vector extract index.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
ExpectedVExtractIdx += 2;
continue;
}
CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
if (!CanFold)
break;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0) == Op1.getOperand(0) &&
isa<ConstantSDNode>(Op0.getOperand(1)) &&
isa<ConstantSDNode>(Op1.getOperand(1)));
if (!CanFold)
break;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
if (i * 2 < NumElts) {
if (V0.isUndef()) {
V0 = Op0.getOperand(0);
if (V0.getValueType() != VT)
return false;
}
} else {
if (V1.isUndef()) {
V1 = Op0.getOperand(0);
if (V1.getValueType() != VT)
return false;
}
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
}
SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
if (I0 == ExpectedVExtractIdx)
CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
else if (IsCommutable && I1 == ExpectedVExtractIdx) {
// Try to match the following dag sequence:
// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
} else
CanFold = false;
ExpectedVExtractIdx += 2;
}
return CanFold;
}
/// Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binop.
/// When Mode is set, the first horizontal binop dag node would take as input
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
/// horizontal binop dag node would take as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
/// Example:
/// HADD V0_LO, V0_HI
/// HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
/// Example:
/// HADD V0_LO, V1_LO
/// HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128 bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
const SDLoc &DL, SelectionDAG &DAG,
unsigned X86Opcode, bool Mode,
bool isUndefLO, bool isUndefHI) {
MVT VT = V0.getSimpleValueType();
assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
"Invalid nodes in input!");
unsigned NumElts = VT.getVectorNumElements();
SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
MVT NewVT = V0_LO.getSimpleValueType();
SDValue LO = DAG.getUNDEF(NewVT);
SDValue HI = DAG.getUNDEF(NewVT);
if (Mode) {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
if (!isUndefLO && !V0->isUndef())
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
if (!isUndefHI && !V1->isUndef())
HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
} else {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Returns true iff \p BV builds a vector with a result equivalent to that of
/// an ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
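///
/// For example, a v4f32 build_vector of the form:
///   (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
///   (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
///   (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
///   (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3))
/// is recognized as ADDSUB(A, B) with \p IsSubAdd set to false.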
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1,
unsigned &NumExtracts,
bool &IsSubAdd) {
MVT VT = BV->getSimpleValueType(0);
if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
return false;
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
NumExtracts = 0;
// Odd-numbered elements in the input build vector are obtained from
// adding/subtracting two float elements.
// Even-numbered elements in the input build vector are obtained from
// subtracting/adding two float elements.
unsigned Opc[2] = {0, 0};
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::UNDEF)
continue;
// Early exit if we found an unexpected opcode.
if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
// Early exit if we cannot match that sequence.
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (I0 != i)
return false;
// We found a valid add/sub node; make sure it's the same opcode as previous
// elements of this parity.
if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
return false;
Opc[i % 2] = Opcode;
// Update InVec0 and InVec1.
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
return false;
}
// Make sure that the operands of each add/sub node always come from the
// same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (Opcode == ISD::FSUB)
return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
return false;
}
if (InVec1 != Op1.getOperand(0))
return false;
// Increment the number of extractions done.
++NumExtracts;
}
// Ensure we have found an opcode for both parities and that they are
// different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
// inputs are undef.
if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
InVec0.isUndef() || InVec1.isUndef())
return false;
IsSubAdd = Opc[0] == ISD::FADD;
Opnd0 = InVec0;
Opnd1 = InVec1;
return true;
}
/// Returns true if it is possible to fold a MUL and an idiom that has already
/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1
/// and \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before replacement of such SDNode with ADDSUB operation. Thus the number
/// of \p Opnd0 uses is expected to be equal to 2.
/// For example, this function may be called for the following IR:
/// %AB = fmul fast <2 x double> %A, %B
/// %Sub = fsub fast <2 x double> %AB, %C
/// %Add = fadd fast <2 x double> %AB, %C
/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
/// <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
/// %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
/// %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
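///
/// On success, the operands are rewritten in place: \p Opnd0 and \p Opnd1
/// become the two factors of the original FMUL, and \p Opnd2 becomes the
/// other original ADDSUB/SUBADD operand.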
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
unsigned ExpectedUses) {
if (Opnd0.getOpcode() != ISD::FMUL ||
!Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
return false;
// FIXME: These checks must match the similar ones in
// DAGCombiner::visitFADDForFMACombine. It would be good to have one
// function that would answer if it is Ok to fuse MUL + ADD to FMADD
// or MUL + ADDSUB to FMADDSUB.
const TargetOptions &Options = DAG.getTarget().Options;
bool AllowFusion =
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
if (!AllowFusion)
return false;
Opnd2 = Opnd1;
Opnd1 = Opnd0.getOperand(1);
Opnd0 = Opnd0.getOperand(0);
return true;
}
/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
unsigned NumExtracts;
bool IsSubAdd;
if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
IsSubAdd))
return SDValue();
MVT VT = BV->getSimpleValueType(0);
SDLoc DL(BV);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
}
// We only support ADDSUB.
if (IsSubAdd)
return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
// 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
// recognition.
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
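/// Returns true if \p BV builds a vector that can be lowered to a single x86
/// horizontal op. On success, \p HOpcode is set to the matching
/// X86ISD::HADD/HSUB/FHADD/FHSUB opcode and the two source vectors are
/// returned in \p V0 and \p V1.
///
/// For example, a v4f32 build_vector of the form:
///   (fadd (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (fadd (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
///   (fadd (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
///   (fadd (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// matches X86ISD::FHADD with V0 == A and V1 == B.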
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
unsigned &HOpcode, SDValue &V0, SDValue &V1) {
// Initialize outputs to known values.
MVT VT = BV->getSimpleValueType(0);
HOpcode = ISD::DELETED_NODE;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
// half of the result is calculated independently from the 128-bit halves of
// the inputs, so that makes the index-checking logic below more complicated.
unsigned NumElts = VT.getVectorNumElements();
unsigned GenericOpcode = ISD::DELETED_NODE;
unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
for (unsigned i = 0; i != Num128BitChunks; ++i) {
for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
// Ignore undef elements.
SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
if (Op.isUndef())
continue;
// If there's an opcode mismatch, we're done.
if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
return false;
// Initialize horizontal opcode.
if (HOpcode == ISD::DELETED_NODE) {
GenericOpcode = Op.getOpcode();
switch (GenericOpcode) {
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default: return false;
}
}
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0.getOperand(0) != Op1.getOperand(0) ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
return false;
// The source vector is chosen based on which 64-bit half of each 128-bit
// chunk of the destination vector is being calculated.
if (j < NumEltsIn64Bits) {
if (V0.isUndef())
V0 = Op0.getOperand(0);
} else {
if (V1.isUndef())
V1 = Op0.getOperand(0);
}
SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
if (SourceVec != Op0.getOperand(0))
return false;
// op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
unsigned ExpectedIndex = i * NumEltsIn128Bits +
(j % NumEltsIn64Bits) * 2;
if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
continue;
// If this is not a commutative op, this does not match.
if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
return false;
// Addition is commutative, so try swapping the extract indexes.
// op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
continue;
// Extract indexes do not match horizontal requirement.
return false;
}
}
// We matched. Opcode and operands are returned by reference as arguments.
return true;
}
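/// Create the X86 horizontal op for a build vector that was recognized by
/// isHopBuildVector(). The source vectors are resized to the width of the
/// build vector if necessary, and the op is shrunk to 128 bits when the
/// upper half of a 256-bit build vector is all-undef.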
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
SelectionDAG &DAG, unsigned HOpcode,
SDValue V0, SDValue V1) {
// If either input vector is not the same size as the build vector,
// extract/insert the low bits to the correct size.
// This is free (examples: zmm --> xmm, xmm --> ymm).
MVT VT = BV->getSimpleValueType(0);
unsigned Width = VT.getSizeInBits();
if (V0.getValueSizeInBits() > Width)
V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
else if (V0.getValueSizeInBits() < Width)
V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
if (V1.getValueSizeInBits() > Width)
V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
else if (V1.getValueSizeInBits() < Width)
V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
unsigned NumElts = VT.getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
for (unsigned i = 0; i != NumElts; ++i)
if (BV->getOperand(i).isUndef())
DemandedElts.clearBit(i);
// If we don't need the upper xmm, then perform as an xmm hop.
unsigned HalfNumElts = NumElts / 2;
if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts);
V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
}
return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We need at least 2 non-undef elements to make this worthwhile by default.
unsigned NumNonUndefs =
count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
if (NumNonUndefs < 2)
return SDValue();
// There are 4 sets of horizontal math operations distinguished by type:
// int/FP at 128-bit/256-bit. Each type was introduced with a different
// subtarget feature. Try to match those "native" patterns first.
MVT VT = BV->getSimpleValueType(0);
if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
unsigned HOpcode;
SDValue V0, V1;
if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
}
// Try harder to match 256-bit ops by using extract/concat.
if (!Subtarget.hasAVX() || !VT.is256BitVector())
return SDValue();
// Count the number of UNDEF operands in the input build_vector.
unsigned NumElts = VT.getVectorNumElements();
unsigned Half = NumElts / 2;
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
for (unsigned i = 0, e = Half; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsLO++;
for (unsigned i = Half, e = NumElts; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsHI++;
SDLoc DL(BV);
SDValue InVec0, InVec1;
if (VT == MVT::v8i32 || VT == MVT::v16i16) {
SDValue InVec2, InVec3;
unsigned X86Opcode;
bool CanFold = true;
if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
InVec1) &&
isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HSUB;
else
CanFold = false;
if (CanFold) {
// Do not try to expand this build_vector into a pair of horizontal
// add/sub if we can emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into a pair of horizontal binops followed by
// a concat vector. We must adjust the outputs from the partial horizontal
// matching calls above to account for undefined vector halves.
SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
isUndefHI);
}
}
if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
VT == MVT::v16i16) {
unsigned X86Opcode;
if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::HSUB;
else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHADD;
else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
InVec1))
X86Opcode = X86ISD::FHSUB;
else
return SDValue();
// Don't try to expand this build_vector into a pair of horizontal add/sub
// if we can simply emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into two horizontal add/sub followed by
// a concat vector.
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
isUndefLO, isUndefHI);
}
return SDValue();
}
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// just apply the bit operation to the vectors.
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
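///
/// For example, a build_vector of the form:
///   (and X0, 1), (and X1, 2), (and X2, 4), (and X3, 8)
/// is lowered (assuming the vector AND is legal for the type) to:
///   (and (build_vector X0, X1, X2, X3), (build_vector 1, 2, 4, 8))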
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Check that all elements have the same opcode.
// TODO: Should we allow UNDEFS and if so how many?
unsigned Opcode = Op->getOperand(0).getOpcode();
for (unsigned i = 1; i < NumElems; ++i)
if (Opcode != Op->getOperand(i).getOpcode())
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
bool IsShift = false;
switch (Opcode) {
default:
return SDValue();
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
IsShift = true;
break;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
// Don't do this if the buildvector is a splat - we'd replace one
// constant with an entire vector.
if (Op->getSplatValue())
return SDValue();
if (!TLI.isOperationLegalOrPromote(Opcode, VT))
return SDValue();
break;
}
SmallVector<SDValue, 4> LHSElts, RHSElts;
for (SDValue Elt : Op->ops()) {
SDValue LHS = Elt.getOperand(0);
SDValue RHS = Elt.getOperand(1);
// We expect the canonicalized RHS operand to be the constant.
if (!isa<ConstantSDNode>(RHS))
return SDValue();
// Extend shift amounts.
if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
if (!IsShift)
return SDValue();
RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
}
LHSElts.push_back(LHS);
RHSElts.push_back(RHS);
}
// Limit to shifts by uniform immediates.
// TODO: Only accept vXi8/vXi64 special cases?
// TODO: Permit non-uniform XOP/AVX2/MULLO cases?
if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
return SDValue();
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
// Vectors containing all zeros can be matched by pxor and xorps.
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getZeroVector(VT, Subtarget, DAG, DL);
}
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
(VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
return getOnesVector(VT, DAG, DL);
}
return SDValue();
}
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
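///
/// For example, a v16i8 permute on an SSSE3 target is lowered directly to
///   (X86ISD::PSHUFB SrcVec, IndicesVec)
/// while a v8i16 permute (absent AVX512BW/VLX) reuses the same byte shuffle by
/// bitcasting to v16i8 and scaling the indices (see the ScaleIndices lambda
/// below).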
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT ShuffleVT = VT;
EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
unsigned NumElts = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
// Adjust IndicesVec to match VT size.
assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
"Illegal variable permute mask size");
if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
NumElts * VT.getScalarSizeInBits());
IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Handle a SrcVec whose size doesn't match VT.
if (SrcVec.getValueSizeInBits() != SizeInBits) {
if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
// Handle larger SrcVec by treating it as a larger permute.
unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
return extractSubVector(
createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
DAG, DL, SizeInBits);
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
} else
return SDValue();
}
auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
EVT SrcVT = Idx.getValueType();
unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
uint64_t IndexScale = 0;
uint64_t IndexOffset = 0;
// If we're scaling a smaller permute op, then we need to repeat the
// indices, scaling and offsetting them as well.
// e.g. v4i32 -> v16i8 (Scale = 4)
// IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
// IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
for (uint64_t i = 0; i != Scale; ++i) {
IndexScale |= Scale << (i * NumDstBits);
IndexOffset |= i << (i * NumDstBits);
}
Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
return Idx;
};
unsigned Opcode = 0;
switch (VT.SimpleTy) {
default:
break;
case MVT::v16i8:
if (Subtarget.hasSSSE3())
Opcode = X86ISD::PSHUFB;
break;
case MVT::v8i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v4f32:
case MVT::v4i32:
if (Subtarget.hasAVX()) {
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v4f32;
} else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v2f64:
case MVT::v2i64:
if (Subtarget.hasAVX()) {
// VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v2f64;
} else if (Subtarget.hasSSE41()) {
// SSE41 can compare v2i64 - select between indices 0 and 1.
return DAG.getSelectCC(
DL, IndicesVec,
getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
ISD::CondCode::SETEQ);
}
break;
case MVT::v32i8:
if (Subtarget.hasVLX() && Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasXOP()) {
SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
return DAG.getNode(
ISD::CONCAT_VECTORS, DL, VT,
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
} else if (Subtarget.hasAVX()) {
SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Permute Lo and Hi and then select based on index range.
// This works as PSHUFB uses bits[3:0] to permute elements and we don't
// care about bit[7] as it's just an index vector.
SDValue Idx = Ops[2];
EVT VT = Idx.getValueType();
return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
ISD::CondCode::SETGT);
};
SDValue Ops[] = {LoLo, HiHi, IndicesVec};
return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
PSHUFBBuilder);
}
break;
case MVT::v16i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
// Scale to v32i8 and perform as v32i8.
IndicesVec = ScaleIndices(IndicesVec, 2);
return DAG.getBitcast(
VT, createVariablePermute(
MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
}
break;
case MVT::v8f32:
case MVT::v8i32:
if (Subtarget.hasAVX2())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{0, 1, 2, 3, 0, 1, 2, 3});
SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{4, 5, 6, 7, 4, 5, 6, 7});
if (Subtarget.hasXOP())
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
LoLo, HiHi, IndicesVec,
DAG.getConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPS only uses index bits[0:1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v4i64:
case MVT::v4f64:
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
SDLoc(SrcVec));
IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
DAG, SDLoc(IndicesVec));
SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
DAG, Subtarget);
return extract256BitVector(Res, 0, DAG, DL);
}
Opcode = X86ISD::VPERMV;
} else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
SDValue LoLo =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
SDValue HiHi =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
if (Subtarget.hasXOP())
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
LoLo, HiHi, IndicesVec,
DAG.getConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPD only uses index bit[1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v64i8:
if (Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v32i16:
if (Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8f64:
case MVT::v8i64:
if (Subtarget.hasAVX512())
Opcode = X86ISD::VPERMV;
break;
}
if (!Opcode)
return SDValue();
assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
"Illegal variable permute shuffle type");
uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
if (Scale > 1)
IndicesVec = ScaleIndices(IndicesVec, Scale);
EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
SDValue Res = Opcode == X86ISD::VPERMV
? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
: DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
return DAG.getBitcast(VT, Res);
}
// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
// reasoned to be a permutation of a vector by indices in a non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
// (extract_elt V, (extract_elt I, 1)),
// ...
// ->
// (vpermv I, V)
//
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue SrcVec, IndicesVec;
// Check for a match of the permute source vector and permute index elements.
// This is done by checking that the i-th build_vector operand is of the form:
// (extract_elt SrcVec, (extract_elt IndicesVec, i)).
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
SDValue Op = V.getOperand(Idx);
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// If this is the first extract encountered in V, set the source vector,
// otherwise verify the extract is from the previously defined source
// vector.
if (!SrcVec)
SrcVec = Op.getOperand(0);
else if (SrcVec != Op.getOperand(0))
return SDValue();
SDValue ExtractedIndex = Op->getOperand(1);
// Peek through extends.
if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
ExtractedIndex = ExtractedIndex.getOperand(0);
if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// If this is the first extract from the index vector candidate, set the
// indices vector, otherwise verify the extract is from the previously
// defined indices vector.
if (!IndicesVec)
IndicesVec = ExtractedIndex.getOperand(0);
else if (IndicesVec != ExtractedIndex.getOperand(0))
return SDValue();
auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
if (!PermIdx || PermIdx->getZExtValue() != Idx)
return SDValue();
}
SDLoc DL(V);
MVT VT = V.getSimpleValueType();
return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
unsigned EVTBits = EltVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (Elt.isUndef())
continue;
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
NumConstants--;
}
if (X86::isZeroNode(Elt))
NumZero++;
else {
assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
NonZeros |= ((uint64_t)1 << i);
NumNonZero++;
}
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
// vector and then insert the variable scalar element. If insertion is not
// supported, fall back to a shuffle to get the scalar blended with the
// constants. Insertion into a zero vector is handled as a special-case
// somewhere below here.
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
// Create an all-constant vector. The variable element in the old
// build vector is replaced by undef in the constant vector. Save the
// variable scalar element and its index for use in the insertelement.
LLVMContext &Context = *DAG.getContext();
Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
SDValue VarElt;
SDValue InsIndex;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (auto *C = dyn_cast<ConstantSDNode>(Elt))
ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
else if (!Elt.isUndef()) {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
}
}
Constant *CV = ConstantVector::get(ConstVecOps);
SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
// The constants we just created may not be legal (eg, floating point). We
// must lower the vector right here because we can not guarantee that we'll
// legalize it before loading it. This is also why we could not just create
// a new build vector here. If the build vector contains illegal constants,
// it could get split back up into a series of insert elements.
// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
if (InsertC < NumEltsInLow128Bits)
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
// There's no good way to insert into the high elements of a >128-bit
// vector, so use shuffles to avoid an extract/insert sequence.
assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
SmallVector<int, 8> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i)
ShuffleMask.push_back(i == InsertC ? NumElts : i);
SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
}
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
(EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
if (VT.getSizeInBits() >= 256) {
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
if (Subtarget.hasAVX()) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
} else {
// Without AVX, we need to extend to a 128-bit vector and then
// insert into the 256-bit vector.
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
}
} else {
assert(VT.is128BitVector() && "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
return DAG.getBitcast(VT, Item);
}
}
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
NumBits/2, DAG, *this, dl);
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
return SDValue();
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
//   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// check if it's possible to issue this instead:
//   shuffle (vload ptr), undef, <1, 1, 1, 1>
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
}
return SDValue();
}
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
return SDValue();
if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
return V;
// See if we can use a vector load to get all of the elements.
{
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
// If this is a splat of pairs of 32-bit elements, we can use a narrower
// build_vector and broadcast it.
// TODO: We could probably generalize this more.
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
// Make sure all the even/odd operands match.
for (unsigned i = 2; i != NumElems; ++i)
if (Ops[i % 2] != Op.getOperand(i))
return false;
return true;
};
if (CanSplat(Op, NumElems, Ops)) {
MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
// Create a new build vector and cast to v2i64/v2f64.
SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
DAG.getBuildVector(NarrowVT, dl, Ops));
// Broadcast from v2i64/v2f64 and cast to final VT.
MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
NewBV));
}
}
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.getSizeInBits() > 128) {
MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
SDValue Upper = DAG.getBuildVector(
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
VT.getSizeInBits() / 2);
}
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
if (NumElems == 4 && NumZero > 0) {
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros >> (i*2)) & 0x3) {
default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
case 1:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
break;
case 2:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
case 3:
Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
}
}
bool Reverse1 = (NonZeros & 0x3) == 2;
bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
};
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to insert each of the higher elements into the
// result vector.
if (Subtarget.hasSSE41()) {
SDValue Result;
if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
return Result;
}
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
if (!Op.getOperand(i).isUndef())
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
Ops[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
// Generate scaled UNPCKL shuffle mask.
SmallVector<int, 16> Mask;
for(unsigned i = 0; i != Scale; ++i)
Mask.push_back(i);
for (unsigned i = 0; i != Scale; ++i)
Mask.push_back(NumElems+i);
Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
return Ops[0];
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
// TODO: Detect subvector broadcast here instead of DAG combine?
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
unsigned NumOperands = Op.getNumOperands();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
++NumZero;
else {
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
NonZeros |= 1 << i;
++NumNonZero;
}
}
// If we have more than 2 non-zeros, build each half separately.
if (NumNonZero > 2) {
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(NumOperands/2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
// Otherwise, build it up through insert_subvectors.
SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
: DAG.getUNDEF(ResVT);
MVT SubVT = Op.getOperand(0).getSimpleValueType();
unsigned NumSubElems = SubVT.getVectorNumElements();
for (unsigned i = 0; i != NumOperands; ++i) {
if ((NonZeros & (1 << i)) == 0)
continue;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
Op.getOperand(i),
DAG.getIntPtrConstant(i * NumSubElems, dl));
}
return Vec;
}
// Lower a vXi1 CONCAT_VECTORS by inserting the non-zero subvectors into a
// zero (or undef) vector, splitting wide concats in half, or leaving the
// operation as-is when it is legal via KUNPCK.
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
unsigned NumOperands = Op.getNumOperands();
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
++NumZero;
else {
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
NonZeros |= (uint64_t)1 << i;
++NumNonZero;
}
}
// If there are zero or one non-zeros we can handle this very simply.
if (NumNonZero <= 1) {
SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
: DAG.getUNDEF(ResVT);
if (!NumNonZero)
return Vec;
unsigned Idx = countTrailingZeros(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
}
if (NumOperands > 2) {
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(NumOperands/2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
assert(NumNonZero == 2 && "Simple cases not handled?");
if (ResVT.getVectorNumElements() >= 16)
return Op; // The operation is legal with KUNPCK
SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
DAG.getUNDEF(ResVT), Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
unsigned NumElems = ResVT.getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
DAG.getIntPtrConstant(NumElems/2, dl));
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT.getVectorElementType() == MVT::i1)
return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
(VT.is512BitVector() && (Op.getNumOperands() == 2 ||
Op.getNumOperands() == 4)));
// AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
// A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//
/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] >= 0 && Mask[i] != i)
return false;
}
return true;
}
/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
int LaneSize = 128 / VT.getScalarSizeInBits();
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
return true;
return false;
}
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
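///
/// For example, the v8i32 mask <0, 9, 2, 11, 4, 13, 6, 15> repeats within each
/// 128-bit lane and yields the repeated mask <0, 5, 2, 7>, where values in
/// [4, 8) refer to the same lane of the second vector.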
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
if (Mask[i] < 0)
continue;
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
: Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] < 0)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
SmallVector<int, 32> RepeatedMask;
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, SM_SentinelUndef);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
if (Mask[i] == SM_SentinelUndef)
continue;
if (Mask[i] == SM_SentinelZero) {
if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
return false;
RepeatedMask[i % LaneSize] = SM_SentinelZero;
continue;
}
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM =
Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask) {
if (Mask.size() != ExpectedMask.size())
return false;
int Size = Mask.size();
// If the values are build vectors, we can look through them to find
// equivalent inputs that make the shuffles equivalent.
auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
for (int i = 0; i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
if (!MaskBV || !ExpectedBV ||
MaskBV->getOperand(Mask[i] % Size) !=
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
}
return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both masks.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask) {
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
"Illegal target shuffle mask");
for (int i = 0; i < Size; ++i)
if (Mask[i] == SM_SentinelUndef)
continue;
else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
return false;
else if (Mask[i] != ExpectedMask[i])
return false;
return true;
}
// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
// mask.
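// For example, Mask = <0, 5, -1, 3> with Zeroable = 0b0010 becomes
// <0, SM_SentinelZero, SM_SentinelUndef, 3>.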
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
const APInt &Zeroable) {
int NumElts = Mask.size();
assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
}
return TargetMask;
}
// Attempt to create a shuffle mask from a VSELECT condition mask.
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
SDValue Cond) {
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return false;
unsigned Size = Cond.getValueType().getVectorNumElements();
Mask.resize(Size, SM_SentinelUndef);
for (int i = 0; i != (int)Size; ++i) {
SDValue CondElt = Cond.getOperand(i);
Mask[i] = i;
// Arbitrarily choose from the 2nd operand if the select condition element
// is undef.
// TODO: Can we do better by matching patterns such as even/odd?
if (CondElt.isUndef() || isNullConstant(CondElt))
Mask[i] += Size;
}
return true;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
if (VT != MVT::v8i32 && VT != MVT::v8f32)
return false;
SmallVector<int, 8> Unpcklwd;
createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
/* Unary = */ false);
SmallVector<int, 8> Unpckhwd;
createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
/* Unary = */ false);
bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
isTargetShuffleEquivalent(Mask, Unpckhwd));
return IsUnpackwdMask;
}
static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
// Create 128-bit vector type based on mask size.
MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
MVT VT = MVT::getVectorVT(EltVT, Mask.size());
// We can't assume a canonical shuffle mask, so try the commuted version too.
SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
ShuffleVectorSDNode::commuteMask(CommutedMask);
// Match any of unary/binary or low/high.
for (unsigned i = 0; i != 4; ++i) {
SmallVector<int, 16> UnpackMask;
createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
isTargetShuffleEquivalent(CommutedMask, UnpackMask))
return true;
}
return false;
}
/// Return true if a shuffle mask chooses elements identically in its top and
/// bottom halves. For example, any splat mask has the same top and bottom
/// halves. If an element is undefined in only one half of the mask, the halves
/// are not considered identical.
static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
unsigned HalfSize = Mask.size() / 2;
for (unsigned i = 0; i != HalfSize; ++i) {
if (Mask[i] != Mask[i + HalfSize])
return false;
}
return true;
}
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
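///
/// For example, Mask = <3, 1, -1, 0> encodes as
/// (3 << 0) | (1 << 2) | (2 << 4) | (0 << 6) == 0x27, with the undef
/// element defaulting to its own index (2).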
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
unsigned Imm = 0;
Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
return Imm;
}
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
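///
/// For example, if V2 is an all-zeros build vector, a v4i32 mask of
/// <0, 5, 2, 7> yields Zeroable = 0b1010 (elements 1 and 3).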
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2) {
APInt Zeroable(Mask.size(), 0);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
int VectorSizeInBits = V1.getValueSizeInBits();
int ScalarSizeInBits = VectorSizeInBits / Mask.size();
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
// Handle the easy cases.
if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
Zeroable.setBit(i);
continue;
}
// Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
M %= Size;
// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
if (V.getOpcode() != ISD::BUILD_VECTOR)
continue;
// If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
// the (larger) source element must be UNDEF/ZERO.
if ((Size % V.getNumOperands()) == 0) {
int Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef() || X86::isZeroNode(Op))
Zeroable.setBit(i);
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = Cst->getAPIntValue();
Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
if (Val == 0)
Zeroable.setBit(i);
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt Val = Cst->getValueAPF().bitcastToAPInt();
Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
if (Val == 0)
Zeroable.setBit(i);
}
continue;
}
// If the BUILD_VECTOR has more elements, then all the (smaller) source
// elements must be UNDEF or ZERO.
if ((V.getNumOperands() % Size) == 0) {
int Scale = V->getNumOperands() / Size;
bool AllZeroable = true;
for (int j = 0; j < Scale; ++j) {
SDValue Op = V.getOperand((M * Scale) + j);
AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
}
if (AllZeroable)
Zeroable.setBit(i);
continue;
}
}
return Zeroable;
}
// The shuffle result has the form:
// 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements appear in
// ascending order. Each element of Zeroable corresponds to a particular
// element of Mask, as described in computeZeroableShuffleElements.
//
// The function looks for a sub-mask whose nonzero elements are in
// increasing order. If such a sub-mask exists, the function returns true.
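//
// e.g. for a v4i32 type with Zeroable = 0b0101 and Mask = <0, 4, 2, 5>, the
// non-zeroable elements 4 and 5 are consecutive and start at the second
// source, so this returns true with IsZeroSideLeft == true.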
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
ArrayRef<int> Mask, const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
for (int i = 0, e = Mask.size(); i < e; i++) {
// Checks if the mask's zeros elements are built from only zeros.
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] < 0)
return false;
if (Zeroable[i])
continue;
// Find the lowest non zero element
if (NextElement < 0) {
NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
IsZeroSideLeft = NextElement != 0;
}
// Exit if the mask's non zero elements are not in increasing order.
if (NextElement != Mask[i])
return false;
NextElement++;
}
return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
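///
/// For example, a unary v8i16 mask <0, 0, 1, 1, 2, 2, 3, 3> (with nothing
/// zeroable) becomes the byte mask <0,1, 0,1, 2,3, 2,3, 4,5, 4,5, 6,7, 6,7>.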
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
const int NumBytes = VT.getSizeInBits() / 8;
const int NumEltBytes = VT.getScalarSizeInBits() / 8;
assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
(Subtarget.hasAVX2() && VT.is256BitVector()) ||
(Subtarget.hasBWI() && VT.is512BitVector()));
SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
// Sign bit set in i8 mask means zero element.
SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
SDValue V;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / NumEltBytes];
if (M < 0) {
PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
continue;
}
if (Zeroable[i / NumEltBytes]) {
PSHUFBMask[i] = ZeroMask;
continue;
}
// We can only use a single input of V1 or V2.
SDValue SrcV = (M >= Size ? V2 : V1);
if (V && V != SrcV)
return SDValue();
V = SrcV;
M %= Size;
// PSHUFB can't cross lanes, ensure this doesn't happen.
if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
return SDValue();
M = M % LaneSize;
M = M * NumEltBytes + (i % NumEltBytes);
PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
}
assert(V && "Failed to find a source input");
MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
const APInt &Zeroable,
ArrayRef<int> Mask, SDValue &V1,
SDValue &V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsLeftZeroSide = true;
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
return SDValue();
unsigned VEXPANDMask = (~Zeroable).getZExtValue();
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
unsigned NumElts = VT.getVectorNumElements();
assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
"Unexpected number of vector elements");
SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
unsigned &UnpackOpcode, bool IsUnary,
ArrayRef<int> TargetMask,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
for (int i = 0; i != NumElts; i += 2) {
int M1 = TargetMask[i + 0];
int M2 = TargetMask[i + 1];
Undef1 &= (SM_SentinelUndef == M1);
Undef2 &= (SM_SentinelUndef == M2);
Zero1 &= isUndefOrZero(M1);
Zero2 &= isUndefOrZero(M2);
}
assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
"Zeroable shuffle detected");
// Attempt to match the target mask against the unpack lo/hi mask patterns.
SmallVector<int, 64> Unpckl, Unpckh;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
// If an unary shuffle, attempt to match as an unpack lo/hi with zero.
if (IsUnary && (Zero1 || Zero2)) {
// Don't bother if we can blend instead.
if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
return false;
bool MatchLo = true, MatchHi = true;
for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
int M = TargetMask[i];
// Ignore if the input is known to be zero or the index is undef.
if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
(M == SM_SentinelUndef))
continue;
MatchLo &= (M == Unpckl[i]);
MatchHi &= (M == Unpckh[i]);
}
if (MatchLo || MatchHi) {
UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
return true;
}
}
// If a binary shuffle, commute and try again.
if (!IsUnary) {
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
std::swap(V1, V2);
return true;
}
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
std::swap(V1, V2);
return true;
}
}
return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1, SDValue V2,
SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
SmallVector<int, 8> Unpckh;
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
// Commute and try again.
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
return SDValue();
}
static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
int Delta) {
int Size = (int)Mask.size();
int Split = Size / Delta;
int TruncatedVectorStart = SwappedOps ? Size : 0;
// Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
return false;
// The rest of the mask should not refer to the truncated vector's elements.
if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
TruncatedVectorStart + Size))
return false;
return true;
}
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
// t0: ch = EntryToken
// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
// t25: v4i32 = truncate t2
// t41: v8i16 = bitcast t25
// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
// t18: v2i64 = bitcast t51
//
// Without avx512vl, this is lowered to:
//
// vpmovqd %zmm0, %ymm0
// vpshufb {{.*#+}} xmm0 =
// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (VT != MVT::v16i8 && VT != MVT::v8i16)
return SDValue();
if (Mask.size() != VT.getVectorNumElements())
return SDValue();
bool SwappedOps = false;
if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
if (!ISD::isBuildVectorAllZeros(V1.getNode()))
return SDValue();
std::swap(V1, V2);
SwappedOps = true;
}
// Look for:
//
// bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
// bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
//
// and similar ones.
if (V1.getOpcode() != ISD::BITCAST)
return SDValue();
if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
return SDValue();
SDValue Src = V1.getOperand(0).getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// The vptrunc** instructions truncating 128 bit and 256 bit vectors
// are only available with avx512vl.
if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
return SDValue();
// Down Convert Word to Byte is only available with avx512bw. The case with
// 256-bit output doesn't contain a shuffle and is therefore not handled here.
if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
!Subtarget.hasBWI())
return SDValue();
// The first half/quarter of the mask should refer to every second/fourth
// element of the vector truncated and bitcasted.
if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
return SDValue();
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
SDValue &V2, unsigned &PackOpcode,
ArrayRef<int> TargetMask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
auto MatchPACK = [&](SDValue N1, SDValue N2) {
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKUS;
return true;
}
}
if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
(N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKSS;
return true;
}
return false;
};
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false);
if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
if (MatchPACK(V1, V2))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true);
if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
if (MatchPACK(V1, V1))
return true;
return false;
}
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
SDValue V1, SDValue V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
Subtarget))
return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
DAG.getBitcast(PackVT, V2));
return SDValue();
}
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
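///
/// For example, with Zeroable = 0b1010, the v4i32 identity mask <0, 1, 2, 3>
/// lowers to V1 & <-1, 0, -1, 0>.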
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT MaskVT = VT;
MVT EltVT = VT.getVectorElementType();
SDValue Zero, AllOnes;
// Use f64 if i64 isn't legal.
if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
EltVT = MVT::f64;
MaskVT = MVT::getVectorVT(EltVT, Mask.size());
}
MVT LogicVT = VT;
if (EltVT == MVT::f32 || EltVT == MVT::f64) {
Zero = DAG.getConstantFP(0.0, DL, EltVT);
AllOnes = DAG.getConstantFP(
APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
LogicVT =
MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
} else {
Zero = DAG.getConstant(0, DL, EltVT);
AllOnes = DAG.getAllOnesConstant(DL, EltVT);
}
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Zeroable[i])
continue;
if (Mask[i] % Size != i)
return SDValue(); // Not a blend.
if (!V)
V = Mask[i] < Size ? V1 : V2;
else if (V != (Mask[i] < Size ? V1 : V2))
return SDValue(); // Can only let one input through the mask.
VMaskOps[i] = AllOnes;
}
if (!V)
return SDValue(); // No non-zeroable elements!
SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
VMask = DAG.getBitcast(LogicVT, VMask);
V = DAG.getBitcast(LogicVT, V);
SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
return DAG.getBitcast(VT, And);
}
/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
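///
/// For example, the v4i32 mask <0, 5, 2, 7> builds V1Mask = <-1, 0, -1, 0>
/// and lowers to (V1 & V1Mask) | (~V1Mask & V2).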
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
return SDValue(); // Shuffled input!
MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
}
SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
MutableArrayRef<int> TargetMask,
bool &ForceV1Zero, bool &ForceV2Zero,
uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
BlendMask = 0;
ForceV1Zero = false, ForceV2Zero = false;
assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
// TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
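// e.g. for Size == 4, the target mask <0, 5, 2, 7> produces
// BlendMask == 0b1010.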
for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
int M = TargetMask[i];
if (M == SM_SentinelUndef)
continue;
if (M == i)
continue;
if (M == i + Size) {
BlendMask |= 1ull << i;
continue;
}
if (M == SM_SentinelZero) {
if (V1IsZeroOrUndef) {
ForceV1Zero = true;
TargetMask[i] = i;
continue;
}
if (V2IsZeroOrUndef) {
ForceV2Zero = true;
BlendMask |= 1ull << i;
TargetMask[i] = i + Size;
continue;
}
}
return false;
}
return true;
}
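// Replicate each bit of a blend mask Scale times, e.g. BlendMask 0b0101 with
// Size == 4 and Scale == 2 becomes 0b00110011.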
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
int Scale) {
uint64_t ScaledMask = 0;
for (int i = 0; i != Size; ++i)
if (BlendMask & (1ull << i))
ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
return ScaledMask;
}
/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v8f32:
assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
LLVM_FALLTHROUGH;
case MVT::v2f64:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
}
// Use PBLENDW for lower/upper lanes and then blend lanes.
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
// merge to VSELECT where useful.
uint64_t LoMask = BlendMask & 0xFF;
uint64_t HiMask = (BlendMask >> 8) & 0xFF;
if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(LoMask, DL, MVT::i8));
SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(HiMask, DL, MVT::i8));
return DAG.getVectorShuffle(
MVT::v16i16, DL, Lo, Hi,
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
}
LLVM_FALLTHROUGH;
}
case MVT::v32i8:
assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v16i8: {
assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
// This form of blend is always done on bytes. Compute the byte vector
// type.
MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// x86 allows load folding with blendvb from the 2nd source operand. But
// we are still using LLVM select here (see comment below), so that's V1.
// If V2 can be load-folded and V1 cannot be load-folded, then commute to
// allow that load-folding possibility.
if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
}
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// generator that boolean values in the elements of an x86 vector register
// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
// mapping a select to operand #1, and 'false' mapping to operand #2. The
// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
// of the element (the remaining are ignored) and 0 in that high bit would
// mean operand #1 while 1 in the high bit would mean operand #2. So while
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
MVT::i8));
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
VT,
DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
}
// Otherwise load an immediate into a GPR, cast to k-register, and use a
// masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
default:
llvm_unreachable("Not a supported integer vector type!");
}
}
/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
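///
/// For example, the v4i32 mask <2, 7, 0, 5> is lowered as a blend with mask
/// <0, 5, 2, 7> followed by the single-input permute <2, 3, 0, 1>.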
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG,
bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
SmallVector<int, 32> PermuteMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
if (BlendMask[Mask[i] % Size] < 0)
BlendMask[Mask[i] % Size] = Mask[i];
else if (BlendMask[Mask[i] % Size] != Mask[i])
return SDValue(); // Can't blend in the needed input!
PermuteMask[i] = Mask[i] % Size;
}
// If only immediate blends, then bail if the blend mask can't be widened to
// i16.
unsigned EltSize = VT.getScalarSizeInBits();
if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
return SDValue();
SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
int NumHalfLaneElts = NumLaneElts / 2;
bool MatchLo = true, MatchHi = true;
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
// Determine UNPCKL/UNPCKH type and operand order.
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
SDValue &Op = Ops[Elt & 1];
if (M < NumElts && (Op.isUndef() || Op == V1))
Op = V1;
else if (NumElts <= M && (Op.isUndef() || Op == V2))
Op = V2;
else
return SDValue();
int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
if (!MatchLo && !MatchHi)
return SDValue();
}
}
assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
// Now check that each pair of elts come from the same unpack pair
// and set the permute mask based on each pair.
// TODO - Investigate cases where we permute individual elements.
SmallVector<int, 32> PermuteMask(NumElts, -1);
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
int M0 = Mask[Lane + Elt + 0];
int M1 = Mask[Lane + Elt + 1];
if (0 <= M0 && 0 <= M1 &&
(M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
return SDValue();
if (0 <= M0)
PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
if (0 <= M1)
PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
}
}
unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
static SDValue lowerShuffleAsByteRotateAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
(VT.is256BitVector() && !Subtarget.hasAVX2()) ||
(VT.is512BitVector() && !Subtarget.hasBWI()))
return SDValue();
// We don't currently support lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
int Scale = VT.getScalarSizeInBits() / 8;
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = VT.getVectorNumElements();
int NumEltsPerLane = NumElts / NumLanes;
// Determine range of mask elts.
bool Blend1 = true;
bool Blend2 = true;
std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
if (M < NumElts) {
Blend1 &= (M == (Lane + Elt));
assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
M = M % NumEltsPerLane;
Range1.first = std::min(Range1.first, M);
Range1.second = std::max(Range1.second, M);
} else {
M -= NumElts;
Blend2 &= (M == (Lane + Elt));
assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
M = M % NumEltsPerLane;
Range2.first = std::min(Range2.first, M);
Range2.second = std::max(Range2.second, M);
}
}
}
// Bail if we don't need both elements.
// TODO - it might be worth doing this for unary shuffles if the permute
// can be widened.
if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
!(0 <= Range2.first && Range2.second < NumEltsPerLane))
return SDValue();
if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
return SDValue();
// Rotate the 2 ops so we can access both ranges, then permute the result.
auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
SDValue Rotate = DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
DAG.getBitcast(ByteVT, Lo),
DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));
SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
if (M < NumElts)
PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
else
PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
}
}
return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
};
// Check if the ranges are small enough to rotate from either direction.
if (Range2.second < Range1.first)
return RotateAndPermute(V1, V2, Range1.first, 0);
if (Range1.second < Range2.first)
return RotateAndPermute(V2, V1, Range2.first, NumElts);
return SDValue();
}
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerShuffleAsDecomposedShuffleBlend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// Shuffle the input elements into the desired positions in V1 and V2 and
// blend them together.
SmallVector<int, 32> V1Mask(Mask.size(), -1);
SmallVector<int, 32> V2Mask(Mask.size(), -1);
SmallVector<int, 32> BlendMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] < Size) {
V1Mask[i] = Mask[i];
BlendMask[i] = i;
} else if (Mask[i] >= Size) {
V2Mask[i] = Mask[i] - Size;
BlendMask[i] = i + Size;
}
// Try to lower with the simpler initial blend/unpack/rotate strategies unless
// one of the input shuffles would be a no-op. We prefer to shuffle inputs as
// the shuffle may be able to fold with a load or other benefit. However, when
// we'll have to do 2x as many shuffles in order to achieve this, a 2-input
// pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
// Only prefer immediate blends to unpack/rotate.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG, true))
return BlendPerm;
if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
DAG))
return UnpackPerm;
if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
DL, VT, V1, V2, Mask, Subtarget, DAG))
return RotatePerm;
// Unpack/rotate failed - try again with variable blends.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG))
return BlendPerm;
}
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
/// Try to lower a vector shuffle as a rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
// [-1, 12, 13, 14, -1, -1, 1, -1]
// [-1, -1, -1, -1, -1, -1, 1, 2]
// [ 3, 4, 5, 6, 7, 8, 9, 10]
// [-1, 4, 5, 6, -1, -1, 9, -1]
// [-1, 4, 5, 6, -1, -1, -1, -1]
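// Each of these masks matches with Rotation == 3; the two groups differ
// only in which input ends up as the low and high halves.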
int Rotation = 0;
SDValue Lo, Hi;
for (int i = 0; i < NumElts; ++i) {
int M = Mask[i];
assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
"Unexpected mask index.");
if (M < 0)
continue;
// Determine where a rotated vector would have started.
int StartIdx = i - (M % NumElts);
if (StartIdx == 0)
// The identity rotation isn't interesting, stop.
return -1;
// If we found the tail of a vector the rotation must be the missing
// front. If we found the head of a vector, it must be how much of the
// head.
int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
if (Rotation == 0)
Rotation = CandidateRotation;
else if (Rotation != CandidateRotation)
// The rotations don't match, so we can't match this mask.
return -1;
// Compute which value this mask is pointing at.
SDValue MaskV = M < NumElts ? V1 : V2;
// Compute which of the two target values this index should be assigned
// to. This reflects whether the high elements are remaining or the low
// elements are remaining.
SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
// Either set up this value if we've not encountered it before, or check
// that it remains consistent.
if (!TargetV)
TargetV = MaskV;
else if (TargetV != MaskV)
// This may be a rotation, but it pulls from the inputs in some
// unsupported interleaving.
return -1;
}
// Check that we successfully analyzed the mask, and normalize the results.
assert(Rotation != 0 && "Failed to locate a viable rotation!");
assert((Lo || Hi) && "Failed to find a rotated input vector!");
if (!Lo)
Lo = Hi;
else if (!Hi)
Hi = Lo;
V1 = Lo;
V2 = Hi;
return Rotation;
}
/// Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
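///
/// For example, a v8i16 mask matching an element rotation of 3 is scaled by
/// the two bytes per element and returned as a byte rotation of 6.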
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return -1;
// PALIGNR works on 128-bit lanes.
SmallVector<int, 16> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
// PALIGNR rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector lane.
int NumElts = RepeatedMask.size();
int Scale = 16 / NumElts;
return Rotation * Scale;
}
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
SDValue Lo = V1, Hi = V2;
int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
if (ByteRotation <= 0)
return SDValue();
// Cast the inputs to i8 vector of correct length to match PALIGNR or
// PSLLDQ/PSRLDQ.
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
Lo = DAG.getBitcast(ByteVT, Lo);
Hi = DAG.getBitcast(ByteVT, Hi);
// SSSE3 targets can use the palignr instruction.
if (Subtarget.hasSSSE3()) {
assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
DAG.getConstant(ByteRotation, DL, MVT::i8)));
}
assert(VT.is128BitVector() &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
assert(ByteVT == MVT::v16i8 &&
"SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getConstant(LoByteShift, DL, MVT::i8));
SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
DAG.getConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
// 128/256-bit vectors are only supported with VLX.
assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
DAG.getConstant(Rotation, DL, MVT::i8));
}
/// Try to lower a vector shuffle as a byte shift sequence.
static SDValue lowerVectorShuffleAsByteShiftMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(VT.is128BitVector() && "Only 128-bit vectors supported");
// We need a shuffle that has zeros at one/both ends and a sequential
// shuffle from one source within.
unsigned ZeroLo = Zeroable.countTrailingOnes();
unsigned ZeroHi = Zeroable.countLeadingOnes();
if (!ZeroLo && !ZeroHi)
return SDValue();
unsigned NumElts = Mask.size();
unsigned Len = NumElts - (ZeroLo + ZeroHi);
if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
return SDValue();
unsigned Scale = VT.getScalarSizeInBits() / 8;
ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
if (!isUndefOrInRange(StubMask, 0, NumElts) &&
!isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
return SDValue();
SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
Res = DAG.getBitcast(MVT::v16i8, Res);
// Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
// inner sequential set of elements, possibly offset:
// 01234567 --> zzzzzz01 --> 1zzzzzzz
// 01234567 --> 4567zzzz --> zzzzz456
// 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
if (ZeroLo == 0) {
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
} else if (ZeroHi == 0) {
unsigned Shift = Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
} else if (!Subtarget.hasSSSE3()) {
// If we don't have PSHUFB then it's worth avoiding an AND constant mask
// by performing 3 byte shifts. Shuffle combining can kick in above that.
// TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getConstant(Scale * Shift, DL, MVT::i8));
Shift += Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
} else
return SDValue();
return DAG.getBitcast(VT, Res);
}
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSLL : (little-endian) left bit shift.
/// [ zz, 0, zz, 2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [ 1, zz, 3, zz]
/// [ -1, -1, 7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz, 0, 1, 2, 3, 4, 5, 6]
/// [ zz, zz, -1, -1, 2, 3, 4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1, 1]
/// PSRLDQ : (little-endian) right byte shift
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
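///
/// For example, a v8i16 match with Scale == 8 and Shift == 3 becomes a byte
/// shift: the returned amount is 3 * 16 / 8 == 6 and ShiftVT is v16i8.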
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
auto CheckZeros = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i < Size; i += Scale)
for (int j = 0; j < Shift; ++j)
if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
return false;
return true;
};
auto MatchShift = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i != Size; i += Scale) {
unsigned Pos = Left ? i + Shift : i;
unsigned Low = Left ? i : i + Shift;
unsigned Len = Scale - Shift;
if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
return -1;
}
int ShiftEltBits = ScalarSizeInBits * Scale;
bool ByteShift = ShiftEltBits > 64;
Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
// Normalize the scale for byte shifts to still produce an i64 element
// type.
Scale = ByteShift ? Scale / 2 : Scale;
// We need to round trip through the appropriate type for the shift.
MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
: MVT::getVectorVT(ShiftSVT, Size / Scale);
return (int)ShiftAmt;
};
// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
// keep doubling the size of the integer elements up to that. We can
// then shift the elements of the integer vector by whole multiples of
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Scale, Left)) {
int ShiftAmt = MatchShift(Shift, Scale, Left);
if (0 < ShiftAmt)
return ShiftAmt;
}
// no match
return -1;
}
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
MVT ShiftVT;
SDValue V = V1;
unsigned Opcode;
// Try to match shuffle against V1 shift.
int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, 0, Zeroable, Subtarget);
// If V1 failed, try to match shuffle against V2 shift.
if (ShiftAmt < 0) {
ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, Size, Zeroable, Subtarget);
V = V2;
}
if (ShiftAmt < 0)
return SDValue();
assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
V = DAG.getNode(Opcode, DL, ShiftVT, V,
DAG.getConstant(ShiftAmt, DL, MVT::i8));
return DAG.getBitcast(VT, V);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
// Determine the extraction length from the part of the
// lower half that isn't zeroable.
int Len = HalfSize;
for (; Len > 0; --Len)
if (!Zeroable[Len - 1])
break;
assert(Len > 0 && "Zeroable shuffle mask");
// Attempt to match first Len sequential elements from the lower half.
SDValue Src;
int Idx = -1;
for (int i = 0; i != Len; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
SDValue &V = (M < Size ? V1 : V2);
M = M % Size;
// The extracted elements must start at a valid index and all mask
// elements must be in the lower half.
if (i > M || M >= HalfSize)
return false;
if (Idx < 0 || (Src == V && Idx == (M - i))) {
Src = V;
Idx = M - i;
continue;
}
return false;
}
if (!Src || Idx < 0)
return false;
assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Src;
return true;
}
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
for (int Idx = 0; Idx != HalfSize; ++Idx) {
SDValue Base;
// Attempt to match first source from mask before insertion point.
if (isUndefInRange(Mask, 0, Idx)) {
/* EMPTY */
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
Base = V1;
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
Base = V2;
} else {
continue;
}
// Extend the extraction length looking to match both the insertion of
// the second source and the remaining elements of the first.
for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
SDValue Insert;
int Len = Hi - Idx;
// Match insertion.
if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
Insert = V1;
} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
Insert = V2;
} else {
continue;
}
// Match the remaining elements of the lower half.
if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
} else if ((!Base || (Base == V1)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
} else if ((!Base || (Base == V2)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
Size + Hi)) {
Base = V2;
} else {
continue;
}
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Base;
V2 = Insert;
return true;
}
}
return false;
}
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG) {
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
/// Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offsetted element index in the input; to avoid excess
/// shuffling, the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must be from
/// the same lane.
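///
/// For example, with SSE4.1, VT == v16i8, Scale == 2, and Offset == 0 this
/// becomes a ZERO_EXTEND of the low 8 bytes of the input to v8i16, bitcast
/// back to v16i8.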
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = 128 / EltBits;
int OffsetLane = Offset / NumEltsPerLane;
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
assert(0 <= Offset && "Extension offset must be positive.");
assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
"Extension offset must be in the first lane or start an upper lane.");
// Check that an index is in same lane as the base offset.
auto SafeOffset = [&](int Idx) {
return OffsetLane == (Idx / NumEltsPerLane);
};
// Shift along an input so that the offset base moves to the first element.
auto ShuffleOffset = [&](SDValue V) {
if (!Offset)
return V;
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = 0; i * Scale < NumElements; ++i) {
int SrcIdx = i + Offset;
ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
}
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
};
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
// TODO: Add AnyExt support.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) {
int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
-1};
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
if (AnyExt && EltBits == 16 && Scale > 2) {
int PSHUFDMask[4] = {Offset / 2, -1,
SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
}
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
assert(VT.is128BitVector() && "Unexpected vector width!");
int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(LoIdx, DL, MVT::i8)));
if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
SDValue Hi = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(HiIdx, DL, MVT::i8)));
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
int Idx = Offset + (i / Scale);
PSHUFBMask[i] = DAG.getConstant(
(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
}
// If we are extending from an offset, ensure we start on a boundary that
// we can unpack from.
int AlignToUnpack = Offset % (NumElements / Scale);
if (AlignToUnpack) {
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = AlignToUnpack; i < NumElements; ++i)
ShMask[i - AlignToUnpack] = i;
InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
Offset -= AlignToUnpack;
}
// Otherwise emit a sequence of unpacks.
do {
unsigned UnpackLoHi = X86ISD::UNPCKL;
if (Offset >= (NumElements / 2)) {
UnpackLoHi = X86ISD::UNPCKH;
Offset -= (NumElements / 2);
}
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
InputV = DAG.getBitcast(InputVT, InputV);
InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
} while (Scale > 1);
return DAG.getBitcast(VT, InputV);
}
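// A minimal standalone sketch of one step of the unpack loop above: on a
// little-endian layout, interleaving the low eight bytes of the input with
// zero bytes (UNPCKLBW against a zero vector) is exactly a zero extension of
// those bytes to 16-bit elements, which is why Scale halves and EltBits
// doubles each iteration. Names are hypothetical.
static void modelUnpackLoWithZero(const unsigned char In[16],
                                  unsigned short Out[8]) {
  for (int i = 0; i != 8; ++i)
    Out[i] = In[i]; // Low byte = In[i], high byte = 0 after the interleave.
}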
/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering; it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = NumElements / NumLanes;
assert(VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit");
assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
auto Lower = [&](int Scale) -> SDValue {
SDValue InputV;
bool AnyExt = true;
int Offset = 0;
int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
int M = Mask[i];
if (M < 0)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements needs to be zeroable.
if (!Zeroable[i])
return SDValue();
// We no longer are in the anyext case.
AnyExt = false;
continue;
}
// Each of the base elements needs to be consecutive indices into the
// same input vector.
SDValue V = M < NumElements ? V1 : V2;
M = M % NumElements;
if (!InputV) {
InputV = V;
Offset = M - (i / Scale);
} else if (InputV != V)
return SDValue(); // Flip-flopping inputs.
// Offset must start in the lowest 128-bit lane or at the start of an
// upper lane.
// FIXME: Is it ever worth allowing a negative base offset?
if (!((0 <= Offset && Offset < NumEltsPerLane) ||
(Offset % NumEltsPerLane) == 0))
return SDValue();
// If we are offsetting, all referenced entries must come from the same
// lane.
if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
return SDValue();
if ((M % NumElements) != (Offset + (i / Scale)))
return SDValue(); // Non-consecutive strided elements.
Matches++;
}
// If we fail to find an input, we have a zero-shuffle which should always
// have already been handled.
// FIXME: Maybe handle this here in case during blending we end up with one?
if (!InputV)
return SDValue();
// If we are offsetting, don't extend if we only match a single input; we
// can always do better by using a basic PSHUF or PUNPCK.
if (Offset != 0 && Matches < 2)
return SDValue();
return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
assert(Bits % 64 == 0 &&
"The number of bits in a vector must be divisible by 64 on x86!");
int NumExtElements = Bits / 64;
// Each iteration, try extending the elements half as much, but into twice as
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
"The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
// General extends failed, but 128-bit vectors may be able to use MOVQ.
if (Bits != 128)
return SDValue();
// Returns one of the source operands if the shuffle can be reduced to a
// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
auto CanZExtLowHalf = [&]() {
for (int i = NumElements / 2; i != NumElements; ++i)
if (!Zeroable[i])
return SDValue();
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
return V1;
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
return V2;
return SDValue();
};
if (SDValue V = CanZExtLowHalf()) {
V = DAG.getBitcast(MVT::v2i64, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
return DAG.getBitcast(VT, V);
}
// No viable ext lowering found.
return SDValue();
}
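// A minimal standalone sketch of the shape of the per-scale check done by the
// Lower lambda above, simplified to a single input and no offset: every
// Scale'th slot must hold the next consecutive source element and all slots
// in between must be known-zero (or undef). Names are hypothetical.
static bool modelMatchesZeroExtend(const int *Mask, const bool *Zeroable,
                                   int NumElements, int Scale) {
  for (int i = 0; i != NumElements; ++i) {
    if (Mask[i] < 0)
      continue;            // Undef is acceptable anywhere.
    if (i % Scale != 0) {
      if (!Zeroable[i])
        return false;      // Stretched-in slots must be zeroable.
      continue;
    }
    if (Mask[i] != i / Scale)
      return false;        // Base elements must be consecutive and strided.
  }
  return true;
}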
/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
SelectionDAG &DAG) {
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
V = peekThroughBitcasts(V);
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
MVT NewVT = V.getSimpleValueType();
if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
// Ensure the scalar operand is the same size as the destination.
// FIXME: Add support for scalar truncation where possible.
SDValue S = V.getOperand(Idx);
if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
return DAG.getBitcast(EltVT, S);
}
return SDValue();
}
/// Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
V = peekThroughBitcasts(V);
return ISD::isNON_EXTLoad(V.getNode());
}
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient lowering
/// patterns across all subtarget feature sets.
static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (i != V2Index && !Zeroable[i]) {
IsV1Zeroable = false;
break;
}
// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
DAG);
if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || EltVT == MVT::i16) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
if (!IsV1Zeroable)
return SDValue();
// Zero-extend directly to i32.
ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
}
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
EltVT == MVT::i16) {
// Either not inserting from the low element of the input or the input
// element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue();
}
if (!IsV1Zeroable) {
// If V1 can't be treated as a zero vector we have fewer options to lower
// this. We can't support integer vectors or non-zero targets cheaply, and
// the V1 elements can't be permuted in any way.
assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
if (!VT.isFloatingPoint() || V2Index != 0)
return SDValue();
SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
if (!VT.is128BitVector())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
"Only two types of floating point element types to handle!");
return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
ExtVT, V1, V2);
}
// This lowering only works for the low element with floating point vectors.
if (VT.isFloatingPoint() && V2Index != 0)
return SDValue();
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getBitcast(VT, V2);
if (V2Index != 0) {
// If we have 4 or fewer lanes we can cheaply shuffle the element into
// the desired position. Otherwise it is more efficient to do a vector
// shift left. We know that we can do a vector shift left because all
// the inputs are zero.
if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
return V2;
}
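// A minimal standalone sketch of the byte-shift amount used by the VSHLDQ
// fallback above: once the inserted element is in lane 0 of an otherwise zero
// vector, moving it to lane V2Index is a whole-register left shift by that
// many bytes. Helper name is hypothetical.
static unsigned modelInsertionByteShift(unsigned V2Index, unsigned EltBits) {
  return V2Index * EltBits / 8;
}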
/// Try to lower a broadcast of a single, truncated integer element coming
/// from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
int BroadcastIdx,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
EVT EltVT = VT.getVectorElementType();
EVT V0VT = V0.getValueType();
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
EVT V0EltVT = V0VT.getVectorElementType();
if (!V0EltVT.isInteger())
return SDValue();
const unsigned EltSize = EltVT.getSizeInBits();
const unsigned V0EltSize = V0EltVT.getSizeInBits();
// This is only a truncation if the original element type is larger.
if (V0EltSize <= EltSize)
return SDValue();
assert(((V0EltSize % EltSize) == 0) &&
"Scalar type sizes must all be powers of 2 on x86!");
const unsigned V0Opc = V0.getOpcode();
const unsigned Scale = V0EltSize / EltSize;
const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
V0Opc != ISD::BUILD_VECTOR)
return SDValue();
SDValue Scalar = V0.getOperand(V0BroadcastIdx);
// If we're extracting non-least-significant bits, shift so we can truncate.
// Hopefully, we can fold away the trunc/srl/load into the broadcast.
// Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
if (const int OffsetIdx = BroadcastIdx % Scale)
Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
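// A minimal standalone sketch of the scalar arithmetic behind the SRL +
// TRUNCATE above: broadcasting the OffsetIdx'th EltSize-bit piece of a wider
// scalar is a right shift followed by truncation. Assumes EltSize < 64;
// helper name is hypothetical.
static unsigned long long modelTruncatedPiece(unsigned long long WideScalar,
                                              unsigned EltSize,
                                              unsigned OffsetIdx) {
  unsigned long long Shifted = WideScalar >> (OffsetIdx * EltSize);
  return Shifted & ((1ULL << EltSize) - 1);
}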
/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
// This routine only handles 128-bit shufps.
assert(Mask.size() == 4 && "Unsupported mask size!");
assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
// To lower with a single SHUFPS we need to have the low half and high half
// each requiring a single input.
if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;
return true;
}
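// A minimal standalone restatement of the predicate above on a plain
// 4-element mask (-1 = undef, 0-3 = first input, 4-7 = second input): each
// half of the result may draw from only one input. Helper name is
// hypothetical.
static bool modelIsSingleSHUFPS(const int M[4]) {
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}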
/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
SDValue N1, ArrayRef<int> Mask,
SelectionDAG &DAG) {
EVT VT = N0.getValueType();
assert((VT.is128BitVector() &&
(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
"VPERM* family of shuffles requires 32-bit or 64-bit elements");
// Check that both sources are extracts of the same source vector.
if (!N0.hasOneUse() || !N1.hasOneUse() ||
N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N0.getOperand(0) != N1.getOperand(0))
return SDValue();
SDValue WideVec = N0.getOperand(0);
EVT WideVT = WideVec.getValueType();
if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
!isa<ConstantSDNode>(N1.getOperand(1)))
return SDValue();
// Match extracts of each half of the wide source vector. Commute the shuffle
// if the extract of the low half is N1.
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
ShuffleVectorSDNode::commuteMask(NewMask);
else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
return SDValue();
// Final bailout: if the mask is simple, we are better off using an extract
// and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
// because that avoids a constant load from memory.
if (NumElts == 4 &&
(isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
return SDValue();
// Extend the shuffle mask with undef elements.
NewMask.append(NumElts, -1);
// shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
NewMask);
// This is free: ymm -> xmm.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
DAG.getIntPtrConstant(0, DL));
}
/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
return SDValue();
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumElts = Mask.size();
unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: X86ISD::VBROADCAST;
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
int BroadcastIdx = -1;
for (int i = 0; i != (int)NumElts; ++i) {
SmallVector<int, 8> BroadcastMask(NumElts, i);
if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
BroadcastIdx = i;
break;
}
}
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
V = V.getOperand(0);
continue;
}
case ISD::CONCAT_VECTORS: {
int OpBitWidth = V.getOperand(0).getValueSizeInBits();
int OpIdx = BitOffset / OpBitWidth;
V = V.getOperand(OpIdx);
BitOffset %= OpBitWidth;
continue;
}
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
if (!ConstantIdx)
break;
int EltBitWidth = VOuter.getScalarValueSizeInBits();
int Idx = (int)ConstantIdx->getZExtValue();
int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
int BeginOffset = Idx * EltBitWidth;
int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
BitOffset -= BeginOffset;
V = VInner;
} else {
V = VOuter;
}
continue;
}
}
break;
}
assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
BroadcastIdx = BitOffset / NumEltBits;
// Do we need to bitcast the source to retrieve the original broadcast index?
bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
// If the original value has a larger element type than the shuffle, the
// broadcast element is in essence truncated. Make that explicit to ease
// folding.
if (BitCastSrc && VT.isInteger())
if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
MVT BroadcastVT = VT;
// Also check the simpler case, where we can directly reuse the scalar.
if (!BitCastSrc &&
((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
} else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: Opcode;
}
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
EVT SVT = BroadcastVT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
} else if (BitOffset != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
if ((BitOffset % 128) != 0)
return SDValue();
assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
"Unexpected bit-offset");
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
// Bitcast back to the same scalar type as BroadcastVT.
if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
MVT ExtVT;
if (V.getValueType().isVector()) {
unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
} else {
ExtVT = BroadcastVT.getScalarType();
}
V = DAG.getBitcast(ExtVT, V);
}
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
V = DAG.getBitcast(MVT::f64, V);
unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
if (V.getValueSizeInBits() > 128) {
MVT ExtVT = V.getSimpleValueType().getScalarType();
ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
V = DAG.getBitcast(ExtVT, V);
}
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
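// A minimal standalone sketch of the CONCAT_VECTORS step in the walk above:
// given the running bit offset of the broadcast element and the width of each
// concatenated operand, the element lives in operand BitOffset / OpBitWidth
// at offset BitOffset % OpBitWidth within it. Struct and helper names are
// hypothetical.
struct ModelConcatStep { int OpIdx; int NewBitOffset; };
static ModelConcatStep modelConcatStep(int BitOffset, int OpBitWidth) {
  return {BitOffset / OpBitWidth, BitOffset % OpBitWidth};
}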
// Check whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
const APInt &Zeroable,
ArrayRef<int> Mask, SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Attempt to match INSERTPS with one element from VA or VB being
// inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
// are updated.
auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
ArrayRef<int> CandidateMask) {
unsigned ZMask = 0;
int VADstIndex = -1;
int VBDstIndex = -1;
bool VAUsedInPlace = false;
for (int i = 0; i < 4; ++i) {
// Synthesize a zero mask from the zeroable elements (includes undefs).
if (Zeroable[i]) {
ZMask |= 1 << i;
continue;
}
// Flag if we use any VA inputs in place.
if (i == CandidateMask[i]) {
VAUsedInPlace = true;
continue;
}
// We can only insert a single non-zeroable element.
if (VADstIndex >= 0 || VBDstIndex >= 0)
return false;
if (CandidateMask[i] < 4) {
// VA input out of place for insertion.
VADstIndex = i;
} else {
// VB input for insertion.
VBDstIndex = i;
}
}
// Don't bother if we have no (non-zeroable) element for insertion.
if (VADstIndex < 0 && VBDstIndex < 0)
return false;
// Determine element insertion src/dst indices. The src index is from the
// start of the inserted vector, not the start of the concatenated vector.
unsigned VBSrcIndex = 0;
if (VADstIndex >= 0) {
// If we have a VA input out of place, we use VA as the V2 element
// insertion and don't use the original V2 at all.
VBSrcIndex = CandidateMask[VADstIndex];
VBDstIndex = VADstIndex;
VB = VA;
} else {
VBSrcIndex = CandidateMask[VBDstIndex] - 4;
}
// If no V1 inputs are used in place, then the result is created only from
// the zero mask and the V2 insertion - so remove V1 dependency.
if (!VAUsedInPlace)
VA = DAG.getUNDEF(MVT::v4f32);
// Update V1, V2 and InsertPSMask accordingly.
V1 = VA;
V2 = VB;
// Insert the V2 element into the desired position.
InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
return true;
};
if (matchAsInsertPS(V1, V2, Mask))
return true;
// Commute and try again.
SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
ShuffleVectorSDNode::commuteMask(CommutedMask);
if (matchAsInsertPS(V2, V1, CommutedMask))
return true;
return false;
}
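// A minimal standalone sketch of the immediate assembled above as
// "VBSrcIndex << 6 | VBDstIndex << 4 | ZMask": bits 7:6 select the source
// element, bits 5:4 select the destination slot, and bits 3:0 zero individual
// result elements. Helper name is hypothetical.
static unsigned modelInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                 unsigned ZMask) {
  return (SrcIdx & 3) << 6 | (DstIdx & 3) << 4 | (ZMask & 0xF);
}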
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> Mask, const APInt &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
unsigned InsertPSMask;
if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
// Insert the V2 element into the desired position.
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
static SDValue lowerShuffleAsPermuteAndUnpack(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
"This routine only supports integer vectors.");
assert(VT.is128BitVector() &&
"This routine only works on 128-bit vectors.");
assert(!V2.isUndef() &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
int Size = Mask.size();
int NumLoInputs =
count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
int NumHiInputs =
count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
bool UnpackLo = NumLoInputs >= NumHiInputs;
auto TryUnpack = [&](int ScalarSize, int Scale) {
SmallVector<int, 16> V1Mask((unsigned)Size, -1);
SmallVector<int, 16> V2Mask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
// Each element of the unpack contains Scale elements from this mask.
int UnpackIdx = i / Scale;
// We only handle the case where V1 feeds the first slots of the unpack.
// We rely on canonicalization to ensure this is the case.
if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
return SDValue();
// Setup the mask for this input. The indexing is tricky as we have to
// handle the unpack stride.
SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
Mask[i] % Size;
}
// If we will have to shuffle both inputs to use the unpack, check whether
// we can just unpack first and shuffle the result. If so, skip this unpack.
if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
!isNoopShuffleMask(V2Mask))
return SDValue();
// Shuffle the inputs into place.
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
// Cast the inputs to the type we will use to unpack them.
MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
V1 = DAG.getBitcast(UnpackVT, V1);
V2 = DAG.getBitcast(UnpackVT, V2);
// Unpack the inputs and cast the result back to the desired type.
return DAG.getBitcast(
VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
UnpackVT, V1, V2));
};
// We try each unpack from the largest to the smallest, trying to find one
// that fits this mask.
int OrigScalarSize = VT.getScalarSizeInBits();
for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
return Unpack;
// If we're shuffling with a zero vector then we're better off not doing
// VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
ISD::isBuildVectorAllZeros(V2.getNode()))
return SDValue();
// If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack.
if (NumLoInputs == 0 || NumHiInputs == 0) {
assert((NumLoInputs > 0 || NumHiInputs > 0) &&
"We have to have *some* inputs!");
int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
// FIXME: We could consider the total complexity of the permute of each
// possible unpacking. Or at the least we should consider how many
// half-crossings are created.
// FIXME: We could consider commuting the unpacks.
SmallVector<int, 32> PermMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
PermMask[i] =
2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
}
return DAG.getVectorShuffle(
VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
DL, VT, V1, V2),
DAG.getUNDEF(VT), PermMask);
}
return SDValue();
}
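// A minimal standalone sketch of the PermMask computation in the fallback
// above: after the initial UNPCKL/UNPCKH, element j of the chosen half of V1
// sits at slot 2*j of the interleaved result and element j of V2 at slot
// 2*j + 1. Helper name is hypothetical.
static int modelPostUnpackSlot(int MaskElt, int Size, int HalfOffset) {
  return 2 * ((MaskElt % Size) - HalfOffset) + (MaskElt < Size ? 0 : 1);
}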
/// Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
return DAG.getNode(
X86ISD::SHUFP, DL, MVT::v2f64,
Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
isShuffleEquivalent(V1, V2, Mask, {1, 3}))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
return DAG.getNode(
X86ISD::MOVSD, DL, MVT::v2f64, V2,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
return V;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
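// A minimal standalone sketch of the two-bit SHUFPD immediate computed at the
// end of the routine above: bit 0 selects which element of the first operand
// lands in result[0], bit 1 selects which element of the second operand lands
// in result[1] (Mask[1] is rebased by -2 because V2's elements are numbered
// 2 and 3 in the shuffle mask). Helper name is hypothetical.
static unsigned modelShufPDImm(int Mask0, int Mask1) {
  return (unsigned)(Mask0 == 1) | ((unsigned)((Mask1 - 2) == 1) << 1);
}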
/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
V1 = DAG.getBitcast(MVT::v4i32, V1);
int WidenedMask[4] = {
std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
return DAG.getBitcast(
MVT::v2i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
}
assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
}
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
// have this problem. It would be really nice if x86 had better shuffles here.
V1 = DAG.getBitcast(MVT::v2f64, V1);
V2 = DAG.getBitcast(MVT::v2f64, V2);
return DAG.getBitcast(MVT::v2i64,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
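// A minimal standalone sketch of how the single-input path above widens a
// v2i64 mask into the v4i32 PSHUFD mask: each 64-bit selection i becomes the
// 32-bit pair {2*i, 2*i+1}, with undef (-1) clamped to 0 since any element is
// acceptable there. Helper name is hypothetical.
static void modelWidenV2MaskToV4(const int M[2], int Widened[4]) {
  int M0 = M[0] < 0 ? 0 : M[0];
  int M1 = M[1] < 0 ? 0 : M[1];
  Widened[0] = M0 * 2;
  Widened[1] = M0 * 2 + 1;
  Widened[2] = M1 * 2;
  Widened[3] = M1 * 2 + 1;
}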
/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
int V2AdjIndex = V2Index ^ 1;
if (Mask[V2AdjIndex] < 0) {
// Handles all the cases where we have a single V2 element and an undef.
// This will only ever happen in the high lanes because we commute the
// vector otherwise.
if (V2Index < 2)
std::swap(LowV, HighV);
NewMask[V2Index] -= 4;
} else {
// Handle the case where the V2 element ends up adjacent to a V1 element.
// To make this work, blend them together as the first step.
int V1Index = V2AdjIndex;
int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now proceed to reconstruct the final blend as we have the necessary
// high or low half formed.
if (V2Index < 2) {
LowV = V2;
HighV = V1;
} else {
HighV = V2;
}
NewMask[V1Index] = 2; // We put the V1 element in V2[2].
NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
}
} else if (NumV2Elements == 2) {
if (Mask[0] < 4 && Mask[1] < 4) {
// Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
NewMask[2] -= 4;
NewMask[3] -= 4;
} else if (Mask[2] < 4 && Mask[3] < 4) {
// We also handle the reversed case because this utility may get called
// when we detect a SHUFPS pattern but can't easily commute the shuffle to
// arrange things in the right direction.
NewMask[0] -= 4;
NewMask[1] -= 4;
HighV = V1;
LowV = V2;
} else {
// We have a mixture of V1 and V2 in both low and high lanes. Rather than
// trying to place elements directly, just blend them and set up the final
// shuffle to place them.
// The first two blend mask elements are for V1, the second two are for
// V2.
int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
Mask[2] < 4 ? Mask[2] : Mask[3],
(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now we do a normal shuffle of V1 by giving V1 as both operands to
// a blend.
LowV = HighV = V1;
NewMask[0] = Mask[0] < 4 ? 0 : 2;
NewMask[1] = Mask[0] < 4 ? 2 : 0;
NewMask[2] = Mask[2] < 4 ? 1 : 3;
NewMask[3] = Mask[2] < 4 ? 3 : 1;
}
}
return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
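// A minimal standalone sketch of the immediate packing that
// getV4X86ShuffleImm8ForMask performs for the SHUFP/PSHUFD-style nodes used
// throughout these routines, assuming the usual two-bits-per-element layout
// with undef treated as 0. Helper name is hypothetical.
static unsigned modelV4ShuffleImm8(const int M[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)(M[i] < 0 ? 0 : (M[i] & 3)) << (2 * i);
  return Imm;
}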
/// Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
if (Subtarget.hasSSE3()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
// in SSE1 because otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
}
// Otherwise, use a straight shuffle of a single input vector. We pass the
// input vector to both operands to simulate this with a SHUFPS.
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
V2, Mask, DAG))
return BlendPerm;
}
// Use low/high mov instructions. These are only valid in SSE1 because
// otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
}
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We coerce the shuffle pattern to be compatible with UNPCK instructions
// but we aren't actually going to use the UNPCK instruction because doing
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
Mask = UnpackLoMask;
else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
}
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (!isSingleSHUFPSMask(Mask)) {
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return Unpack;
}
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
// Attempt to directly match PSHUFLW or PSHUFHW.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
}
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
for (int i = 0; i != 4; ++i)
HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
}
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
// If we are shuffling values from one half, check how many different DWORD
// pairs we need to create. If only 1 or 2 then we can perform this as a
// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
V = DAG.getNode(ShufWOp, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
V = DAG.getBitcast(PSHUFDVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
return DAG.getBitcast(VT, V);
};
if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
int PSHUFDMask[4] = { -1, -1, -1, -1 };
SmallVector<std::pair<int, int>, 4> DWordPairs;
int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
// Collect the different DWORD pairs.
for (int DWord = 0; DWord != 4; ++DWord) {
int M0 = Mask[2 * DWord + 0];
int M1 = Mask[2 * DWord + 1];
M0 = (M0 >= 0 ? M0 % 4 : M0);
M1 = (M1 >= 0 ? M1 % 4 : M1);
if (M0 < 0 && M1 < 0)
continue;
bool Match = false;
for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
auto &DWordPair = DWordPairs[j];
if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
(M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
PSHUFDMask[DWord] = DOffset + j;
Match = true;
break;
}
}
if (!Match) {
PSHUFDMask[DWord] = DOffset + DWordPairs.size();
DWordPairs.push_back(std::make_pair(M0, M1));
}
}
if (DWordPairs.size() <= 2) {
DWordPairs.resize(2, std::make_pair(-1, -1));
int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
DWordPairs[1].first, DWordPairs[1].second};
if ((NumHToL + NumHToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
if ((NumLToL + NumLToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
}
}
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs in each half. Once there, we can fall through
// to the generic code below. For example:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
//
// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
// and an existing 2-into-2 on the other half. In this case we may have to
// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
// Fortunately, we don't have to handle anything but a 2-into-2 pattern
// because any other situation (including a 3-into-1 or 1-into-3 in the half
// other than the one we target for fixing) will be fixed when we re-enter this
// path. We will also combine any resulting sequence of PSHUFD instructions
// into a single instruction. Here is an example of the tricky case:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
//
// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
//
// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
//
// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
//
// The result is fine to be handled by the generic logic.
auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
int AOffset, int BOffset) {
assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
"Must call this with A having 3 or 1 inputs from the A half.");
assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
"Must call this with B having 1 or 3 inputs from the B half.");
assert(AToAInputs.size() + BToAInputs.size() == 4 &&
"Must call this with either 3:1 or 1:3 inputs (summing to 4).");
bool ThreeAInputs = AToAInputs.size() == 3;
// Compute the index of the dword that contains only one of the three inputs
// in that half: take the sum of all four word indices in the half with three
// inputs and subtract the sum of the actual three inputs. The difference is
// the remaining (non-input) slot.
int ADWord = 0, BDWord = 0;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
TripleDWord = TripleNonInputIdx / 2;
// We use xor with one to compute the adjacent DWord to whichever one the
// OneInput is in.
OneInputDWord = (OneInput / 2) ^ 1;
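// For example, if OneInput is word 5 it lives in dword 5 / 2 == 2, and
// 2 ^ 1 == 3 selects the adjacent dword within the same half.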
// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
// and BToA inputs. If there is also such a problem with the BToB and AToB
// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
// is essential that we don't *create* a 3<-1 as then we might oscillate.
if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
// Compute how many inputs will be flipped by swapping these DWords. We
// need to balance this to ensure we don't form a 3-1 shuffle in the other
// half.
int NumFlippedAToBInputs =
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
int NumFlippedBToBInputs =
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
if ((NumFlippedAToBInputs == 1 &&
(NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
(NumFlippedBToBInputs == 1 &&
(NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
// We choose whether to fix the A half or B half based on whether that
// half has zero flipped inputs. At zero, we may not be able to fix it
// with that half. We also bias towards fixing the B half because that
// will more commonly be the high half, and we have to bias one way.
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
ArrayRef<int> Inputs) {
int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
// Determine whether the free index is in the flipped dword or the
// unflipped dword based on where the pinned index is. We use this bit
// in an xor to conditionally select the adjacent dword.
int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
if (IsFixIdxInput == IsFixFreeIdxInput)
FixFreeIdx += 1;
IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
assert(IsFixIdxInput != IsFixFreeIdxInput &&
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
V = DAG.getNode(
FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
};
if (NumFlippedBToBInputs != 0) {
int BPinnedIdx =
BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
} else {
assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
}
}
}
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
if (M >= 0 && M/2 == ADWord)
M = 2 * BDWord + M % 2;
else if (M >= 0 && M/2 == BDWord)
M = 2 * ADWord + M % 2;
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
// each half. That means the inputs can always be grouped into dwords and
// those dwords can then be moved to the correct half with a dword shuffle.
// We use at most one low and one high word shuffle to collect these paired
// inputs into dwords, and finally a dword shuffle to place them.
int PSHUFLMask[4] = {-1, -1, -1, -1};
int PSHUFHMask[4] = {-1, -1, -1, -1};
int PSHUFDMask[4] = {-1, -1, -1, -1};
// First fix the masks for all the inputs that are staying in their
// original halves. This will then dictate the targets of the cross-half
// shuffles.
auto fixInPlaceInputs =
[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
if (InPlaceInputs.empty())
return;
if (InPlaceInputs.size() == 1) {
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
return;
}
if (IncomingInputs.empty()) {
// Just fix all of the in place inputs.
for (int Input : InPlaceInputs) {
SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
PSHUFDMask[Input / 2] = Input / 2;
}
return;
}
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
// Put the second input next to the first so that they are packed into
// a dword. We find the adjacent index by toggling the low bit.
int AdjIndex = InPlaceInputs[0] ^ 1;
SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
};
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
// Now gather the cross-half inputs and place them into a free dword of
// their target half.
// FIXME: This operation could almost certainly be simplified dramatically to
// look more like the 3-1 fixing operation.
auto moveInputsToRightHalf = [&PSHUFDMask](
MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
int LowWord = Word & ~1;
int HighWord = Word | 1;
return isWordClobbered(SourceHalfMask, LowWord) ||
isWordClobbered(SourceHalfMask, HighWord);
};
if (IncomingInputs.empty())
return;
if (ExistingInputs.empty()) {
// Map each dword that contains an input into the right half.
for (int Input : IncomingInputs) {
// If the source half mask maps over the inputs, turn those into
// swaps and use the swapped lane.
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
Input - SourceOffset;
// We have to swap the uses in our half mask in one sweep.
for (int &M : HalfMask)
if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
M = Input;
else if (M == Input)
M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
} else {
assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
Input - SourceOffset &&
"Previous placement doesn't match!");
}
// Note that this correctly re-maps both when we do a swap and when
// we observe the other side of the swap above. We rely on that to
// avoid swapping the members of the input list directly.
Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
}
// Map the input's dword into the correct half.
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
else
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
Input / 2 &&
"Previous placement doesn't match!");
}
// And just directly shift any other-half mask elements to be same-half
// as we will have mirrored the dword containing the element into the
// same position within that half.
for (int &M : HalfMask)
if (M >= SourceOffset && M < SourceOffset + 4) {
M = M - SourceOffset + DestOffset;
assert(M >= 0 && "This should never wrap below zero!");
}
return;
}
// Ensure we have the input in a viable dword of its current half. This
// is particularly tricky because the original position may be clobbered
// by inputs being moved and *staying* in that half.
if (IncomingInputs.size() == 1) {
if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
SourceOffset;
SourceHalfMask[InputFixed - SourceOffset] =
IncomingInputs[0] - SourceOffset;
std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
InputFixed);
IncomingInputs[0] = InputFixed;
}
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
// We have two non-adjacent or clobbered inputs we need to extract from
// the source half. To do this, we need to map them into some adjacent
// dword slot in the source mask.
int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
IncomingInputs[1] - SourceOffset};
// If there is a free slot in the source half mask adjacent to one of
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
} else {
// The only way we hit this point is if there is no clobbering
// (because there are no off-half inputs to this half) and there is no
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
// We also have to update the final source mask in this case because
// it may need to undo the above swap.
for (int &M : FinalSourceHalfMask)
if (M == (InputsFixed[0] ^ 1) + SourceOffset)
M = InputsFixed[1] + SourceOffset;
else if (M == InputsFixed[1] + SourceOffset)
M = (InputsFixed[0] ^ 1) + SourceOffset;
InputsFixed[1] = InputsFixed[0] ^ 1;
}
// Point everything at the fixed inputs.
for (int &M : HalfMask)
if (M == IncomingInputs[0])
M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
M = InputsFixed[1] + SourceOffset;
IncomingInputs[0] = InputsFixed[0] + SourceOffset;
IncomingInputs[1] = InputsFixed[1] + SourceOffset;
}
} else {
llvm_unreachable("Unhandled input size!");
}
// Now hoist the DWord down to the right half.
int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
for (int &M : HalfMask)
for (int Input : IncomingInputs)
if (M == Input)
M = FreeDWord * 2 + Input % 2;
};
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
/*SourceOffset*/ 4, /*DestOffset*/ 0);
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
/*SourceOffset*/ 0, /*DestOffset*/ 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!");
assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
// Do a half shuffle with the high mask after shifting its values down.
for (int &M : HiMask)
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
return V;
}
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
"Lane crossing shuffle masks not supported");
int NumBytes = VT.getSizeInBits() / 8;
int Size = Mask.size();
int Scale = NumBytes / Size;
SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
V1InUse = false;
V2InUse = false;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Scale];
if (M < 0)
continue;
const int ZeroMask = 0x80;
int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
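// For example (illustrative), with a v8i16 mask (Size == 8, Scale == 2),
// if mask element 3 selects element 3 of V1 then at byte i == 7 we get
// V1Idx = 3 * 2 + 1 == 7 and V2Idx == 0x80 (PSHUFB's zero-out index), so
// V1 supplies that byte and V2 contributes zero.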
if (Zeroable[i / Scale])
V1Idx = V2Idx = ZeroMask;
V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
V1InUse |= (ZeroMask != V1Idx);
V2InUse |= (ZeroMask != V2Idx);
}
MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
if (V1InUse)
V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
DAG.getBuildVector(ShufVT, DL, V1Mask));
if (V2InUse)
V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
DAG.getBuildVector(ShufVT, DL, V2Mask));
// If we need shuffled inputs from both, blend the two.
SDValue V;
if (V1InUse && V2InUse)
V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
else
V = V1InUse ? V1 : V2;
// Cast the result back to the correct type.
return DAG.getBitcast(VT, V);
}
/// Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
Subtarget, DAG))
return Rotate;
// Make a copy of the mask so it can be modified.
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
Subtarget, DAG);
}
assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
"All single-input shuffles should be canonicalized to be V1-input "
"shuffles.");
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
// Try to use byte shift instructions to mask.
if (SDValue V = lowerVectorShuffleAsByteShiftMask(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG);
}
/// Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
bool IsSingleInput) {
// The modulus for the shuffle vector entries is based on whether this is
// a single input or not.
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
"We should only be called with masks with a power-of-2 size!");
uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
// We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
// and 2^3 simultaneously. This is because we may have ambiguity with
// partially undef inputs.
bool ViableForN[3] = {true, true, true};
for (int i = 0, e = Mask.size(); i < e; ++i) {
// Ignore undef lanes, we'll optimistically collapse them to the pattern we
// want.
if (Mask[i] < 0)
continue;
bool IsAnyViable = false;
for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j]) {
uint64_t N = j + 1;
// The shuffle mask must be equal to (i * 2^N) % M.
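// For example, with the single-input N = 1 pattern above (ShuffleModulus
// == 16), element i == 9 must equal (9 << 1) & 15 == 2.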
if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
IsAnyViable = true;
else
ViableForN[j] = false;
}
// Early exit if we exhaust the possible powers of two.
if (!IsAnyViable)
break;
}
for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j])
return j + 1;
// Return 0 as there is no viable power of two.
return 0;
}
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
/// Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use a zext lowering.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
// things significantly. Currently, this means we need to be able to
// express the pre-duplication shuffle as an i16 shuffle.
//
// FIXME: We should check for other patterns which can be widened into an
// i16 shuffle as well.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
for (int i = 0; i < 16; i += 2)
if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
};
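// For example, <6,6,3,3,0,0,1,1,4,4,7,7,2,2,5,5> can be widened because
// every even/odd byte pair requests the same source byte.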
auto tryToWidenViaDuplication = [&]() -> SDValue {
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
bool TargetLo = LoInputs.size() >= HiInputs.size();
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
SmallDenseMap<int, int, 8> LaneMap;
for (int I : InPlaceInputs) {
PreDupI16Shuffle[I/2] = I/2;
LaneMap[I] = I;
}
int j = TargetLo ? 0 : 4, je = j + 4;
for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
// Check if j is already a shuffle of this input. This happens when
// there are two adjacent bytes after we move the low one.
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
// If we haven't yet mapped the input, search for a slot into which
// we can map it.
while (j < je && PreDupI16Shuffle[j] >= 0)
++j;
if (j == je)
// We can't place the inputs into a single half with a simple i16 shuffle, so bail.
return SDValue();
// Map this input with the i16 shuffle.
PreDupI16Shuffle[j] = MovingInputs[i] / 2;
}
// Update the lane map based on the mapping we ended up with.
LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
}
V1 = DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
MVT::v16i8, V1, V1);
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
"Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};
if (SDValue V = tryToWidenViaDuplication())
return V;
}
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Try to use byte shift instructions to mask.
if (SDValue V = lowerVectorShuffleAsByteShiftMask(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
// lowerings can find an instruction sequence that is faster than a PSHUFB, we
// want to preserve that and we can DAG combine any longer sequences into
// a PSHUFB in the end. But once we start blending from multiple inputs,
// the complexity of DAG combining bad patterns back into PSHUFB is too high,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
//
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget.hasSSSE3()) {
bool V1InUse = false;
bool V2InUse = false;
SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
// cases. Even though the or may be marginally more efficient, we prefer
// this lowering because there are common cases where part of the
// complexity of the shuffles goes away when we do the final blend as
// an unpack.
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
if (SDValue V = lowerShuffleAsByteRotateAndPermute(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return V;
}
return PSHUFB;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Blend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
//
// We special case these as they can be particularly efficiently handled with
// the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
bool IsSingleInput = V2.isUndef();
if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
// We use the mask type to pick which bytes are preserved based on how many
// elements are dropped.
MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
SDValue ByteClearMask = DAG.getBitcast(
MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
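// For example, with NumEvenDrops == 1 the constant is a v8i16 splat of
// 0x00FF; after the bitcast every odd byte of the clear mask is zero, so
// the ANDs below keep only the even-indexed bytes for PACKUS to pack.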
V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
// Now pack things back together.
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
return Result;
}
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
SDValue V = V1;
std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0)
(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
SDValue VLoHalf, VHiHalf;
// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
// them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
DAG.getConstant(0x00FF, DL, MVT::v8i16));
// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
VHiHalf = DAG.getUNDEF(MVT::v8i16);
// Squash the masks to point directly into VLoHalf.
for (int &M : LoBlendMask)
if (M >= 0)
M /= 2;
for (int &M : HiBlendMask)
if (M >= 0)
M /= 2;
} else {
// Otherwise just unpack the low half of V into VLoHalf and the high half into
// VHiHalf so that we can blend them as i16s.
SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
VLoHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
VHiHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
}
SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
/// Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
}
}
/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
MVT ScalarVT = VT.getVectorElementType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
// Rather than splitting build-vectors, just build two narrower build
// vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
V = peekThroughBitcasts(V);
MVT OrigVT = V.getSimpleValueType();
int OrigNumElements = OrigVT.getVectorNumElements();
int OrigSplitNumElements = OrigNumElements / 2;
MVT OrigScalarVT = OrigVT.getVectorElementType();
MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
SDValue LoV, HiV;
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV) {
LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
DAG.getIntPtrConstant(0, DL));
HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
DAG.getIntPtrConstant(OrigSplitNumElements, DL));
} else {
SmallVector<SDValue, 16> LoOps, HiOps;
for (int i = 0; i < OrigSplitNumElements; ++i) {
LoOps.push_back(BV->getOperand(i));
HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
}
LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
}
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
};
SDValue LoV1, HiV1, LoV2, HiV2;
std::tie(LoV1, HiV1) = SplitVector(V1);
std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
if (M >= NumElements + SplitNumElements)
UseHiV2 = true;
else
UseLoV2 = true;
V2BlendMask[i] = M - NumElements;
BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}
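// For example (illustrative), with v8f32 (SplitNumElements == 4) a
// half-mask element M == 9 lands in the low half of V2: V2BlendMask[i]
// becomes 9 - 8 == 1 and BlendMask[i] becomes 4 + i, so the final blend
// reads it from V2Blend.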
// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.
// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
if (!UseLoV2 && !UseHiV2)
return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
if (!UseLoV1 && !UseHiV1)
return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
V1Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
V1Blend = UseLoV1 ? LoV1 : HiV1;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
V2Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
V2Blend = UseLoV2 ? LoV2 : HiV2;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= SplitNumElements)
BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
/// Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.");
int Size = Mask.size();
// If this can be modeled as a broadcast of two elements followed by a blend,
// prefer that lowering. This is especially important because broadcasts can
// often fold with memory operands.
auto DoBothBroadcast = [&] {
int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
for (int M : Mask)
if (M >= Size) {
if (V2BroadcastIdx < 0)
V2BroadcastIdx = M - Size;
else if (M - Size != V2BroadcastIdx)
return false;
} else if (M >= 0) {
if (V1BroadcastIdx < 0)
V1BroadcastIdx = M;
else if (M != V1BroadcastIdx)
return false;
}
return true;
};
if (DoBothBroadcast())
return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
Subtarget, DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
// unusually few instructions.
int LaneCount = VT.getSizeInBits() / 128;
int LaneSize = Size / LaneCount;
SmallBitVector LaneInputs[2];
LaneInputs[0].resize(LaneCount, false);
LaneInputs[1].resize(LaneCount, false);
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
// Otherwise, just fall back to decomposed shuffles and a blend. This requires
// that the decomposed single-input shuffles don't end up here.
return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
DAG);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
/// This is mainly for cases where we can have non-repeating permutes
/// in each lane.
///
/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask;
/// we should investigate merging them.
static SDValue lowerShuffleAsLanePermuteAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
if (M < 0)
continue;
// Ensure that each lane comes from a single source lane.
int SrcLane = M / NumEltsPerLane;
int DstLane = i / NumEltsPerLane;
if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
return SDValue();
SrcLaneMask[DstLane] = SrcLane;
PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
}
// Make sure we set all elements of the lane mask, to avoid undef propagation.
SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
int SrcLane = SrcLaneMask[DstLane];
if (0 <= SrcLane)
for (int j = 0; j != NumEltsPerLane; ++j) {
LaneMask[(DstLane * NumEltsPerLane) + j] =
(SrcLane * NumEltsPerLane) + j;
}
}
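// For example, a single-input v8i32 mask <6,4,7,5,1,3,0,2> gives
// SrcLaneMask = <1,0>, so LaneMask becomes <4,5,6,7,0,1,2,3> (swap the
// 128-bit lanes) and PermMask becomes <2,0,3,1,5,7,4,6> (permute within
// each lane).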
// If we're only shuffling the lowest lane and the rest of the lanes are
// identity then don't bother.
// TODO - isShuffleMaskInputInPlace could be extended to something like this.
int NumIdentityLanes = 0;
bool OnlyShuffleLowestLane = true;
for (int i = 0; i != NumLanes; ++i) {
if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
i * NumEltsPerLane))
NumIdentityLanes++;
else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
OnlyShuffleLowestLane = false;
}
if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
return SDValue();
SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerShuffleAsLanePermuteAndBlend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
int LaneSize = Size / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
if (!Subtarget.hasAVX2()) {
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
} else {
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneUsed[(Mask[i] / LaneSize)] = true;
if (!LaneUsed[0] || !LaneUsed[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
SmallVector<int, 32> FlippedBlendMask(Size);
for (int i = 0; i < Size; ++i)
FlippedBlendMask[i] =
Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
? Mask[i]
: Mask[i] % LaneSize +
(i / LaneSize) * LaneSize + Size);
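// For example, a single-input v8f32 mask <0,1,6,7,4,5,2,3> becomes the
// blend mask <0,1,10,11,4,5,14,15>: the cross-lane elements are taken
// from the lane-flipped copy (operand indices 8..15) at their in-lane
// positions.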
// Flip the vector, and blend the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
SDValue Flipped = DAG.getBitcast(PVT, V1);
Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
{ 2, 3, 0, 1 });
Flipped = DAG.getBitcast(VT, Flipped);
return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
/// Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
return SDValue();
bool IsLowZero = (Zeroable & 0x3) == 0x3;
bool IsHighZero = (Zeroable & 0xc) == 0xc;
// Try to use an insert into a zero vector.
if (WidenedMask[0] == 0 && IsHighZero) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
}
// TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(2, DL));
}
}
// Try to use SHUF128 if possible.
if (Subtarget.hasVLX()) {
if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
((WidenedMask[1] % 2) << 1);
return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
}
}
}
// Otherwise form a 128-bit permutation. After accounting for undefs,
// convert the 64-bit shuffle mask selection values into 128-bit
// selection bits by dividing the indexes by 2 and shifting into positions
// defined by a vperm2*128 instruction's immediate control byte.
// The immediate permute control byte looks like this:
// [1:0] - select 128 bits from sources for low half of destination
// [2] - ignore
// [3] - zero low half of destination
// [5:4] - select 128 bits from sources for high half of destination
// [6] - ignore
// [7] - zero high half of destination
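// For example, taking the high 128 bits of V1 for the low half and the
// low 128 bits of V2 for the high half gives WidenedMask = {1, 2} and
// PermMask = (1 << 0) | (2 << 4) == 0x21.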
assert((WidenedMask[0] >= 0 || IsLowZero) &&
(WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
unsigned PermMask = 0;
PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
// Check the immediate mask and replace unused sources with undef.
if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
V1 = DAG.getUNDEF(VT);
if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
V2 = DAG.getUNDEF(VT);
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
}
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This attempts to create a repeated lane shuffle where each lane uses one
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
int Size = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int LaneSize = 128 / VT.getScalarSizeInBits();
SmallVector<int, 16> RepeatMask(LaneSize, -1);
SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Srcs[2] = { -1, -1 };
SmallVector<int, 16> InLaneMask(LaneSize, -1);
for (int i = 0; i != LaneSize; ++i) {
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
// Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out of
// sources we can't do anything.
int LaneSrc = M / LaneSize;
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
Src = 0;
else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
Src = 1;
else
return SDValue();
Srcs[Src] = LaneSrc;
InLaneMask[i] = (M % LaneSize) + Src * Size;
}
// If this lane has two sources, see if it fits with the repeat mask so far.
if (Srcs[1] < 0)
continue;
LaneSrcs[Lane][0] = Srcs[0];
LaneSrcs[Lane][1] = Srcs[1];
auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
assert(M1.size() == M2.size() && "Unexpected mask size");
for (int i = 0, e = M1.size(); i != e; ++i)
if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
return false;
return true;
};
auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
for (int i = 0, e = MergedMask.size(); i != e; ++i) {
int M = Mask[i];
if (M < 0)
continue;
assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
"Unexpected mask element");
MergedMask[i] = M;
}
};
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Didn't find a match. Swap the operands and try again.
std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
ShuffleVectorSDNode::commuteMask(InLaneMask);
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Couldn't find a match with the operands in either order.
return SDValue();
}
// Now handle any lanes with only one source.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
// If this lane has already been processed, skip it.
if (LaneSrcs[Lane][0] >= 0)
continue;
for (int i = 0; i != LaneSize; ++i) {
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
// If RepeatMask isn't defined yet, we can define it ourselves.
if (RepeatMask[i] < 0)
RepeatMask[i] = M % LaneSize;
if (RepeatMask[i] < Size) {
if (RepeatMask[i] != M % LaneSize)
return SDValue();
LaneSrcs[Lane][0] = M / LaneSize;
} else {
if (RepeatMask[i] != ((M % LaneSize) + Size))
return SDValue();
LaneSrcs[Lane][1] = M / LaneSize;
}
}
if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
return SDValue();
}
SmallVector<int, 16> NewMask(Size, -1);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][0];
for (int i = 0; i != LaneSize; ++i) {
int M = -1;
if (Src >= 0)
M = Src * LaneSize + i;
NewMask[Lane * LaneSize + i] = M;
}
}
SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV1) &&
cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
return SDValue();
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][1];
for (int i = 0; i != LaneSize; ++i) {
int M = -1;
if (Src >= 0)
M = Src * LaneSize + i;
NewMask[Lane * LaneSize + i] = M;
}
}
SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV2) &&
cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
return SDValue();
for (int i = 0; i != Size; ++i) {
NewMask[i] = RepeatMask[i % LaneSize];
if (NewMask[i] < 0)
continue;
NewMask[i] += (i / LaneSize) * LaneSize;
}
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
/// If the input shuffle mask results in a vector that is undefined in all upper
/// or lower half elements and that mask accesses only 2 halves of the
/// shuffle's operands, return true. A mask of half the width with mask indexes
/// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
/// lower half of each input operand is accessed.
static bool
getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
int &HalfIdx1, int &HalfIdx2) {
assert((Mask.size() == HalfMask.size() * 2) &&
"Expected input mask to be twice as long as output");
// Exactly one half of the result must be undef to allow narrowing.
bool UndefLower = isUndefLowerHalf(Mask);
bool UndefUpper = isUndefUpperHalf(Mask);
if (UndefLower == UndefUpper)
return false;
unsigned HalfNumElts = HalfMask.size();
unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
HalfIdx1 = -1;
HalfIdx2 = -1;
for (unsigned i = 0; i != HalfNumElts; ++i) {
int M = Mask[i + MaskIndexOffset];
if (M < 0) {
HalfMask[i] = M;
continue;
}
// Determine which of the 4 half vectors this element is from.
// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
int HalfIdx = M / HalfNumElts;
// Determine the element index into its half vector source.
int HalfElt = M % HalfNumElts;
// We can shuffle with up to 2 half vectors, set the new 'half'
// shuffle mask accordingly.
if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
HalfMask[i] = HalfElt;
HalfIdx1 = HalfIdx;
continue;
}
if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
HalfMask[i] = HalfElt + HalfNumElts;
HalfIdx2 = HalfIdx;
continue;
}
// Too many half vectors referenced.
return false;
}
return true;
}
/// Given the output values from getHalfShuffleMask(), create a half width
/// shuffle of extracted vectors followed by an insert back to full width.
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> HalfMask, int HalfIdx1,
int HalfIdx2, bool UndefLower,
SelectionDAG &DAG) {
assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
assert(V1.getValueType().isSimple() && "Expecting only simple types");
MVT VT = V1.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
SDValue V = (HalfIdx < 2 ? V1 : V2);
HalfIdx = (HalfIdx % 2) * HalfNumElts;
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
DAG.getIntPtrConstant(HalfIdx, DL));
};
// ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
SDValue Half1 = getHalfVector(HalfIdx1);
SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
unsigned Offset = UndefLower ? HalfNumElts : 0;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
}
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.is256BitVector() || VT.is512BitVector()) &&
"Expected 256-bit or 512-bit vector");
bool UndefLower = isUndefLowerHalf(Mask);
if (!UndefLower && !isUndefUpperHalf(Mask))
return SDValue();
assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
"Completely undef shuffle mask should have been simplified already");
// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
if (!UndefLower &&
isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(HalfNumElts, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(0, DL));
}
// Lower half is undef and upper half is whole lower subvector.
// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (UndefLower &&
isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(HalfNumElts, DL));
}
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(HalfNumElts);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
return SDValue();
assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
// Only shuffle the halves of the inputs when useful.
unsigned NumLowerHalves =
(HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
unsigned NumUpperHalves =
(HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
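// e.g. HalfIdx1 == 0 (lower half of V1) and HalfIdx2 == 3 (upper half of V2)
// give NumLowerHalves == 1 and NumUpperHalves == 1.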
assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
// Determine the larger pattern of undef/halves, then decide if it's worth
// splitting the shuffle based on subtarget capabilities and types.
unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
if (!UndefLower) {
// XXXXuuuu: no insert is needed.
// Always extract lowers when setting lower - these are all free subreg ops.
if (NumUpperHalves == 0)
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
if (NumUpperHalves == 1) {
// AVX2 has efficient 32/64-bit element cross-lane shuffles.
if (Subtarget.hasAVX2()) {
// extract128 + vunpckhps/vshufps is better than vblend + vpermps.
if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
!is128BitUnpackShuffleMask(HalfMask) &&
(!isSingleSHUFPSMask(HalfMask) ||
Subtarget.hasFastVariableShuffle()))
return SDValue();
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
// are better off extracting the upper half of 1 operand and using a
// narrow shuffle.
if (EltWidth == 64 && V2.isUndef())
return SDValue();
}
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Extract + narrow shuffle is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
}
// Don't extract both uppers, instead shuffle and then extract.
assert(NumUpperHalves == 2 && "Half vector count went wrong");
return SDValue();
}
// UndefLower - uuuuXXXX: an insert to high half is required if we split this.
if (NumUpperHalves == 0) {
// AVX2 has efficient 64-bit element cross-lane shuffles.
// TODO: Refine to account for unary shuffle, splat, and other masks?
if (Subtarget.hasAVX2() && EltWidth == 64)
return SDValue();
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Narrow shuffle + insert is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
}
// NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
return SDValue();
}
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
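/// For example, input 0 is in place for the v4 mask <0, 5, 2, 7>: its
/// referenced elements (0 and 2) already sit in result slots 0 and 2.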
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
return false;
return true;
}
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
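/// For example, on AVX2 the v8i32 mask <6, 7, 6, 7, 2, 3, 2, 3> can be lowered
/// as the in-lane repeating shuffle <2, 3, u, u, 6, 7, u, u> followed by the
/// 64-bit sub-lane permute <4, 5, 4, 5, 0, 1, 0, 1>.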
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
// On AVX2 we may be able to just shuffle the lowest elements and then
// broadcast the result.
if (Subtarget.hasAVX2()) {
for (unsigned BroadcastSize : {16, 32, 64}) {
if (BroadcastSize <= VT.getScalarSizeInBits())
continue;
int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
// Attempt to match a repeating pattern every NumBroadcastElts,
// accounting for UNDEFs but only referencing the lowest 128-bit
// lane of the inputs.
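// e.g. the v8i32 mask <1, 0, 1, 0, 1, 0, 1, 0> repeats <1, 0> every 64 bits
// using only lane 0, so we shuffle <1, 0, u, u, u, u, u, u> in place and then
// broadcast it with <0, 1, 0, 1, 0, 1, 0, 1>.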
auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j) {
int M = Mask[i + j];
if (M < 0)
continue;
int &R = RepeatMask[j];
if (0 != ((M % NumElts) / NumLaneElts))
return false;
if (0 <= R && R != M)
return false;
R = M;
}
return true;
};
SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
if (!FindRepeatingBroadcastMask(RepeatMask))
continue;
// Shuffle the (lowest) repeated elements in place for broadcast.
SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
// Shuffle the actual broadcast.
SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j)
BroadcastMask[i + j] = j;
return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
BroadcastMask);
}
}
// Bail if the shuffle mask doesn't cross 128-bit lanes.
if (!is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
// Bail if we already have a repeated lane shuffle mask.
SmallVector<int, 8> RepeatedShuffleMask;
if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
return SDValue();
// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
int NumSubLanes = NumLanes * SubLaneScale;
int NumSubLaneElts = NumLaneElts / SubLaneScale;
// Check that all the sources are coming from the same lane and see if we can
// form a repeating shuffle mask (local to each sub-lane). At the same time,
// determine the source sub-lane for each destination sub-lane.
int TopSrcSubLane = -1;
SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
// Extract the sub-lane mask, check that it all comes from the same lane
// and normalize the mask entries to come from the first lane.
int SrcLane = -1;
SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
if (M < 0)
continue;
int Lane = (M % NumElts) / NumLaneElts;
if ((0 <= SrcLane) && (SrcLane != Lane))
return SDValue();
SrcLane = Lane;
int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
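// e.g. for v8f32 (NumLaneElts == 4), M == 13 (element 1 of V2's lane 1)
// normalizes to LocalM == 9 (element 1 of V2).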
SubLaneMask[Elt] = LocalM;
}
// Whole sub-lane is UNDEF.
if (SrcLane < 0)
continue;
// Attempt to match against the candidate repeated sub-lane masks.
for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
for (int i = 0; i != NumSubLaneElts; ++i) {
if (M1[i] < 0 || M2[i] < 0)
continue;
if (M1[i] != M2[i])
return false;
}
return true;
};
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
continue;
// Merge the sub-lane mask into the matching repeated sub-lane mask.
for (int i = 0; i != NumSubLaneElts; ++i) {
int M = SubLaneMask[i];
if (M < 0)
continue;
assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
"Unexpected mask element");
RepeatedSubLaneMask[i] = M;
}
// Track the topmost source sub-lane - by setting the remaining to UNDEF
// we can greatly simplify shuffle matching.
int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
break;
}
// Bail if we failed to find a matching repeated sub-lane mask.
if (Dst2SrcSubLanes[DstSubLane] < 0)
return SDValue();
}
assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
"Unexpected source lane");
// Create a repeating shuffle mask for the entire vector.
SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
int Lane = SubLane / SubLaneScale;
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = RepeatedSubLaneMask[Elt];
if (M < 0)
continue;
int Idx = (SubLane * NumSubLaneElts) + Elt;
RepeatedMask[Idx] = M + (Lane * NumLaneElts);
}
}
SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
// Shuffle each source sub-lane to its destination.
SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumSubLaneElts) {
int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
if (SrcSubLane < 0)
continue;
for (int j = 0; j != NumSubLaneElts; ++j)
SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
}
return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
SubLaneMask);
}
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
unsigned &ShuffleImm, ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
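// e.g. for v4f64, the mask <1, 5, 2, 7> is a valid SHUFPD mask and yields
// ShuffleImm = 0b1011.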
ShuffleImm = 0;
bool ShufpdMask = true;
bool CommutableMask = true;
for (int i = 0; i < NumElts; ++i) {
if (Mask[i] == SM_SentinelUndef)
continue;
if (Mask[i] < 0)
return false;
int Val = (i & 6) + NumElts * (i & 1);
int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
if (Mask[i] < Val || Mask[i] > Val + 1)
ShufpdMask = false;
if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
CommutableMask = false;
ShuffleImm |= (Mask[i] % 2) << i;
}
if (ShufpdMask)
return true;
if (CommutableMask) {
std::swap(V1, V2);
return true;
}
return false;
}
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
DAG.getConstant(Immediate, DL, MVT::i8));
}
/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
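// e.g. Mask <1, 0, 3, 2> gives VPERMILPMask = 0b0101.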
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
DAG.getConstant(VPERMILPMask, DL, MVT::i8));
}
// With AVX2 we have direct support for this permutation.
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
Mask, DAG, Subtarget))
return V;
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either input is already in place,
// we will be able to shuffle the other input even across lanes in a single
// instruction, so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on both lanes.
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
// AVX2 provides a direct instruction for permuting a single input across
// lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
}
// Try to use PALIGNR.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either input is already in place,
// we will be able to shuffle the other input even across lanes in a single
// instruction, so skip this pattern.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends.
return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have a single-input shuffle with different shuffle patterns in the
// two 128-bit lanes, use the variable-mask VPERMILPS.
if (V2.isUndef()) {
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// For non-AVX512, if the mask consists of 16-bit elements within each lane
// then try to split, since after the split we get more efficient code using
// the vpunpcklwd and vpunpckhwd instructions than with vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// For non-AVX512, if the mask consists of 16-bit elements within each lane
// then try to split, since after the split we get more efficient code than
// vblend by using the vpunpcklwd and vpunpckhwd instructions.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
}
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
// If the shuffle patterns aren't repeated but it is a single-input shuffle,
// directly generate a cross-lane VPERMD instruction.
if (V2.isUndef()) {
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v8i32, ShufPS);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return V;
if (V2.isUndef()) {
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask,
DAG, Subtarget);
}
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
}
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512BWVL can lower to VPERMW.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return V;
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG,
Subtarget);
}
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512VBMIVL can lower to VPERMB.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG);
}
/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue V1, SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
// can check for those subtargets here and avoid much of the subtarget
// querying in the per-vector-type lowering routines. With AVX1 we have
// essentially *zero* ability to manipulate a 256-bit vector with integer
// types. Since we'll use floating point types there eventually, just
// immediately cast everything to a float and operate entirely in that domain.
if (VT.isInteger() && !Subtarget.hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
if (ElementBits < 32) {
// No floating point type available, if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.
if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
VT.getVectorNumElements());
V1 = DAG.getBitcast(FpVT, V1);
V2 = DAG.getBitcast(FpVT, V2);
return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
}
switch (VT.SimpleTy) {
case MVT::v4f64:
return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
}
}
/// Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
// Handling a 256-bit vector requires VLX, and the function
// lowerV2X128VectorShuffle() is most probably a better solution.
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
// TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
// Try to use an insert into a zero vector.
if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
(WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
}
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
{0, 1, 2, 3, 0, 1, 2, 3});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
{0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(4, DL));
}
assert(WidenedMask.size() == 4);
// See if this is an insertion of the lower 128-bits of V2 into V1.
bool IsInsert = true;
int V2Index = -1;
for (int i = 0; i < 4; ++i) {
assert(WidenedMask[i] >= -1);
if (WidenedMask[i] < 0)
continue;
// Make sure all V1 subvectors are in place.
if (WidenedMask[i] < 4) {
if (WidenedMask[i] != i) {
IsInsert = false;
break;
}
} else {
// Make sure we only have a single V2 index and it's the lowest 128 bits.
if (V2Index >= 0 || WidenedMask[i] != 4) {
IsInsert = false;
break;
}
V2Index = i;
}
}
if (IsInsert && V2Index >= 0) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
DAG.getIntPtrConstant(0, DL));
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
// Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
// Ensure elements came from the same Op.
for (int i = 0; i < 4; ++i) {
assert(WidenedMask[i] >= -1);
if (WidenedMask[i] < 0)
continue;
SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
else if (Ops[OpIndex] != Op)
return SDValue();
// Convert the 128-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
PermMask |= (WidenedMask[i] % 4) << (i * 2);
}
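// e.g. WidenedMask <0, 2, 4, 6> selects 128-bit chunks 0 and 2 of V1 for the
// low 256 bits and chunks 0 and 2 of V2 for the high 256 bits, giving
// PermMask = 0x88.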
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
DAG.getConstant(PermMask, DL, MVT::i8));
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (V2.isUndef()) {
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
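// e.g. Mask <0, 1, 3, 2, 5, 4, 6, 7> gives VPERMILPMask = 0x96.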
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
DAG.getConstant(VPERMILPMask, DL, MVT::i8));
}
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
V2, Subtarget, DAG))
return Shuf128;
if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Otherwise, fall back to a SHUFPS sequence.
return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
// If we have a single-input shuffle with different shuffle patterns in the
// 128-bit lanes that does not cross lanes, use the variable-mask VPERMILPS.
if (V2.isUndef() &&
!is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
}
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on all four
// 128-bit lanes.
SmallVector<int, 2> Repeated128Mask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
SmallVector<int, 4> PSHUFDMask;
scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
return DAG.getBitcast(
MVT::v8i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
DAG.getBitcast(MVT::v16i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
SmallVector<int, 4> Repeated256Mask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
V2, Subtarget, DAG))
return Shuf128;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the four 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.
if (Subtarget.hasBWI())
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Assume that a single SHUFPS is faster than using a permv shuffle.
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (V2.isUndef()) {
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
}
}
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (!V2.isUndef())
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// FIXME: Implement direct support for this type!
return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = Mask.size();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
}
}
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
// version of matchShuffleAsShift.
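// For example, with a v8i1 mask (Z = zeroable element):
//   <2,3,4,5,6,7,Z,Z> matches KSHIFTR by 2, and
//   <Z,Z,0,1,2,3,4,5> matches KSHIFTL by 2.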
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable) {
int Size = Mask.size();
auto CheckZeros = [&](int Shift, bool Left) {
for (int j = 0; j < Shift; ++j)
if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
return false;
return true;
};
auto MatchShift = [&](int Shift, bool Left) {
unsigned Pos = Left ? Shift : 0;
unsigned Low = Left ? 0 : Shift;
unsigned Len = Size - Shift;
return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
};
for (int Shift = 1; Shift != Size; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
return Shift;
}
return -1;
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
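// For example, a v16i1 shuffle is sign-extended to v16i32 (or to v16i16 when
// 512-bit ops are being avoided), shuffled there, and converted back to a mask.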
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
unsigned NumElts = Mask.size();
// Try to recognize shuffles that are just padding a subvector with zeros.
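// For example, if the top half of a v8i1 mask is zeroable and the bottom half
// is the identity <0,1,2,3>, we can extract the low v4i1 subvector from V1 and
// insert it into a zero vector.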
unsigned SubvecElts = 0;
for (int i = 0; i != (int)NumElts; ++i) {
if (Mask[i] >= 0 && Mask[i] != i)
break;
++SubvecElts;
}
assert(SubvecElts != NumElts && "Identity shuffle?");
// Clip to a power of 2.
SubvecElts = PowerOf2Floor(SubvecElts);
// Make sure the number of zeroable bits in the top at least covers the bits
// not covered by the subvector.
if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
V1, DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL),
Extract, DAG.getIntPtrConstant(0, DL));
}
// Try to match KSHIFTs.
// TODO: Support narrower than legal shifts by widening and extracting.
if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) {
unsigned Offset = 0;
for (SDValue V : { V1, V2 }) {
unsigned Opcode;
int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
if (ShiftAmt >= 0)
return DAG.getNode(Opcode, DL, VT, V,
DAG.getConstant(ShiftAmt, DL, MVT::i8));
Offset += NumElts; // Increment for next iteration.
}
}
MVT ExtVT;
switch (VT.SimpleTy) {
default:
llvm_unreachable("Expected a vector of i1 elements");
case MVT::v2i1:
ExtVT = MVT::v2i64;
break;
case MVT::v4i1:
ExtVT = MVT::v4i32;
break;
case MVT::v8i1:
// Take a 512-bit type; more shuffles are available on KNL. If we have VLX,
// use a 256-bit shuffle.
ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
break;
case MVT::v16i1:
// Take 512-bit type, unless we are avoiding 512-bit types and have the
// 256-bit operation available.
ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
break;
case MVT::v32i1:
// Take 512-bit type, unless we are avoiding 512-bit types and have the
// 256-bit operation available.
assert(Subtarget.hasBWI() && "Expected AVX512BW support");
ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;
case MVT::v64i1:
ExtVT = MVT::v64i8;
break;
}
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
// Since i1 was sign-extended we can convert back to a mask by comparing
// against zero, falling back to a truncate otherwise.
int NumElems = VT.getVectorNumElements();
if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
(Subtarget.hasDQI() && (NumElems < 32)))
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
Shuffle, ISD::SETGT);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
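/// For example, a v4i32 mask <4,5,6,1> takes three elements from V2 but only
/// one from V1, so commuting the operands lets the lowering code assume most
/// elements come from the first input.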
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
int NumElements = Mask.size();
int NumV1Elements = 0, NumV2Elements = 0;
for (int M : Mask)
if (M < 0)
continue;
else if (M < NumElements)
++NumV1Elements;
else
++NumV2Elements;
// Commute the shuffle as needed such that more elements come from V1 than
// V2. This allows us to match the shuffle pattern strictly on how many
// elements come from V1 without handling the symmetric cases.
if (NumV2Elements > NumV1Elements)
return true;
assert(NumV1Elements > 0 && "No V1 indices");
if (NumV2Elements == 0)
return false;
// When the numbers of V1 and V2 elements are the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum of
// indices for V2. When those are equal, try to ensure that the number of odd
// indices for V1 is lower than the number of odd indices for V2.
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : Mask.slice(0, NumElements / 2))
if (M >= NumElements)
++LowV2Elements;
else if (M >= 0)
++LowV1Elements;
if (LowV2Elements > LowV1Elements)
return true;
if (LowV2Elements == LowV1Elements) {
int SumV1Indices = 0, SumV2Indices = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= NumElements)
SumV2Indices += i;
else if (Mask[i] >= 0)
SumV1Indices += i;
if (SumV2Indices < SumV1Indices)
return true;
if (SumV2Indices == SumV1Indices) {
int NumV1OddIndices = 0, NumV2OddIndices = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= NumElements)
NumV2OddIndices += i % 2;
else if (Mask[i] >= 0)
NumV1OddIndices += i % 2;
if (NumV2OddIndices < NumV1OddIndices)
return true;
}
}
}
return false;
}
/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
SDLoc DL(Op);
bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
"Can't lower MMX shuffles");
bool V1IsUndef = V1.isUndef();
bool V2IsUndef = V2.isUndef();
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
// When we create a shuffle node we put the UNDEF node in the second operand,
// but in some cases the first operand may be transformed to UNDEF.
// In that case we should just commute the node.
if (V1IsUndef)
return DAG.getCommutedVectorShuffle(*SVOp);
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
if (V2IsUndef &&
any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
}
// Check for illegal shuffle mask element index values.
int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
assert(llvm::all_of(Mask,
[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index");
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
// Create an alternative mask with info about zeroable elements.
// Here we do not set undef elements as zeroable.
SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
if (V2IsZero) {
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
for (int i = 0; i != NumElements; ++i)
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
ZeroableMask[i] = SM_SentinelZero;
}
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
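// For example, a v8i16 mask <0,1,4,5,2,3,6,7> widens to the v4i32 mask
// <0,2,1,3>, halving the number of elements the lowering has to handle.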
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
canWidenShuffleElements(ZeroableMask, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
int NewNumElts = NumElements / 2;
MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
if (V2IsZero) {
// Modify the new Mask to take all zeros from the all-zero vector.
// Choose indices that are blend-friendly.
bool UsedZeroVector = false;
assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
"V2's non-undef elements are used?!");
for (int i = 0; i != NewNumElts; ++i)
if (WidenedMask[i] == SM_SentinelZero) {
WidenedMask[i] = i + NewNumElts;
UsedZeroVector = true;
}
// Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
// some elements to be undef.
if (UsedZeroVector)
V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
}
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
}
}
// Commute the shuffle if it will improve canonicalization.
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is256BitVector())
return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is512BitVector())
return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (Is1BitVector)
return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
/// Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
// Only non-legal VSELECTs reach this lowering; convert those into generic
// shuffles and re-use the shuffle lowering path for blends.
SmallVector<int, 32> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
return SDValue();
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
return SDValue();
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
// If this VSELECT has a vector of i1 as a mask, it will be directly matched
// with patterns on the mask registers on AVX-512.
MVT CondVT = Cond.getSimpleValueType();
unsigned CondEltSize = Cond.getScalarValueSizeInBits();
if (CondEltSize == 1)
return Op;
// Variable blends are only legal from SSE4.1 onward.
if (!Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
if (VT.getSizeInBits() == 512) {
// Build a mask by testing the condition against zero.
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
DAG.getConstant(0, dl, CondVT),
ISD::SETNE);
// Now return a new VSELECT using the mask.
return DAG.getSelect(dl, VT, Mask, LHS, RHS);
}
// SEXT/TRUNC cases where the mask doesn't match the destination size.
if (CondEltSize != EltSize) {
// If we don't have a sign splat, rely on the expansion.
if (CondEltSize != DAG.ComputeNumSignBits(Cond))
return SDValue();
MVT NewCondSVT = MVT::getIntegerVT(EltSize);
MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
}
// Only some types will be legal on some subtargets. If we can emit a legal
// VSELECT-matching blend, return Op, but if we need to expand, return
// a null value.
switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
case MVT::v32i8:
// The byte blends for AVX vectors were introduced only in AVX2.
if (Subtarget.hasAVX2())
return Op;
return SDValue();
case MVT::v8i16:
case MVT::v16i16: {
// Bitcast everything to the vXi8 type and use a vXi8 vselect.
MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
Cond = DAG.getBitcast(CastVT, Cond);
LHS = DAG.getBitcast(CastVT, LHS);
RHS = DAG.getBitcast(CastVT, RHS);
SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
return DAG.getBitcast(VT, Select);
}
}
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register which will require a movd to copy
// the result back to an FR32 register. It's only worth matching if the
// result has a single use which is a store or a bitcast to i32. And in
// the case of a store, it's not worth it if the index is a constant 0,
// because a MOVSSmr can be used instead, which is smaller and faster.
if (!Op.hasOneUse())
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
if ((User->getOpcode() != ISD::STORE ||
isNullConstant(Op.getOperand(1))) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
Op.getOperand(1));
return DAG.getBitcast(MVT::f32, Extract);
}
if (VT == MVT::i32 || VT == MVT::i64) {
// ExtractPS/pextrq works with constant index.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
}
return SDValue();
}
/// Extract one bit from a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
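/// For example, extracting bit 3 of a v8i1 mask is done with a KSHIFTR of the
/// k-register by 3 followed by an extract of element 0.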
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Vec = Op.getOperand(0);
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// A variable index can't be handled in mask registers,
// so extend the vector to VR512/VR128.
if (!isa<ConstantSDNode>(Idx)) {
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512-bit gets better performance on KNL
// than extending to 128/256-bit.
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
// Extend to natively supported kshift.
unsigned NumElems = VecVT.getVectorNumElements();
MVT WideVecVT = VecVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
DAG.getUNDEF(WideVecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
// Use kshiftr instruction to move to the lower element.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
}
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
if (!isa<ConstantSDNode>(Idx)) {
// It's more profitable to go through memory (1 cycle throughput)
// than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
// The IACA tool was used to get the performance estimate
// (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
//
// example : extractelement <16 x i8> %a, i32 %i
//
// Block Throughput: 3.00 Cycles
// Throughput Bottleneck: Port5
//
// | Num Of | Ports pressure in cycles | |
// | Uops | 0 - DV | 5 | 6 | 7 | |
// ---------------------------------------------
// | 1 | | 1.0 | | | CP | vmovd xmm1, edi
// | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
// | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
// Total Num Of Uops: 4
//
//
// Block Throughput: 1.00 Cycles
// Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
//
// | | Ports pressure in cycles | |
// |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
// ---------------------------------------------------------
// |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
// |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
// |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
// Total Num Of Uops: 4
return SDValue();
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// If this is a 256-bit or 512-bit vector result, first extract the 128-bit
// subvector and then extract the element from that 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
// Get the 128-bit vector.
Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(IdxVal, dl));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
MVT VT = Op.getSimpleValueType();
if (VT.getSizeInBits() == 16) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
// we're going to zero extend the register or fold the store (SSE41 only).
if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
!(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
// Transform it so it matches pextrw, which produces a 32-bit result.
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
if (Subtarget.hasSSE41())
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
// TODO: We only extract a single element from v16i8, we can probably afford
// to be more aggressive here before using the default approach of spilling to
// stack.
if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
// Extract either the lowest i32 or any i16, and extract the sub-byte.
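// For example, without SSE4.1, extracting byte 5 of a v16i8 becomes an
// extract of i16 word 2 shifted right by 8 and truncated to i8, rather than
// spilling to the stack.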
int DWordIdx = IdxVal / 4;
if (DWordIdx == 0) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec),
DAG.getIntPtrConstant(DWordIdx, dl));
int ShiftVal = (IdxVal % 4) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
int WordIdx = IdxVal / 2;
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
DAG.getBitcast(MVT::v8i16, Vec),
DAG.getIntPtrConstant(WordIdx, dl));
int ShiftVal = (IdxVal % 2) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
return Op;
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
if (IdxVal == 0)
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
return SDValue();
}
/// Insert one bit into a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
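/// For a constant index this is done by moving the scalar into a v1i1 and
/// using INSERT_SUBVECTOR at that index; a variable index falls back to
/// extend + insert + truncate.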
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue Elt = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
MVT VecVT = Vec.getSimpleValueType();
if (!isa<ConstantSDNode>(Idx)) {
// Non-constant index: extend the source and destination,
// insert the element, and then truncate the result.
unsigned NumElts = VecVT.getVectorNumElements();
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
}
// Copy into a k-register, extract to v1i1 and insert_subvector.
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
Op.getOperand(2));
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG, Subtarget);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
auto *N2C = dyn_cast<ConstantSDNode>(N2);
if (!N2C || N2C->getAPIntValue().uge(NumElts))
return SDValue();
uint64_t IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
// If we are inserting an element, see if we can do this more efficiently with
// a blend shuffle against a rematerializable vector rather than a costly
// integer insertion.
if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
16 <= EltVT.getSizeInBits()) {
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
: getOnesVector(VT, DAG, dl);
return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
// With a 256-bit vector, we can insert into the zero element efficiently
// using a blend if we have AVX or AVX2 and the right data type.
if (VT.is256BitVector() && IdxVal == 0) {
// TODO: It is worthwhile to cast integer to floating point and back
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
N2 = DAG.getIntPtrConstant(1, dl);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
}
}
// Get the desired 128-bit vector chunk.
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(NumEltsIn128));
// Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getIntPtrConstant(IdxIn128, dl));
// Insert the changed part back into the bigger vector
return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// This will be just movd/movq/movss/movsd.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
(EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
EltVT == MVT::i64)) {
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
}
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument. SSE41 required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
unsigned Opc;
if (VT == MVT::v8i16) {
assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
Opc = X86ISD::PINSRW;
} else {
assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
Opc = X86ISD::PINSRB;
}
if (N1.getValueType() != MVT::i32)
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
}
if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
// these bits. For example (insert (extract, 3), 2) could be matched by
// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
// Bits [5:4] of the constant are the destination select. This is the
// value of the incoming immediate.
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
// will always have equal or better performance than insertps.
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
N2 = DAG.getIntPtrConstant(1, dl);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
}
N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
// Create this as a scalar to vector.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
}
// PINSR* works with constant index.
if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
}
return SDValue();
}
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
// It's always cheaper to replace an xor+movd with xorps, and it simplifies
// further combines.
if (X86::isZeroNode(Op.getOperand(0)))
return getZeroVector(OpVT, Subtarget, DAG, dl);
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
"Expected an SSE type!");
// Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
if (OpVT == MVT::v4i32)
return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
return insert1BitVector(Op, DAG, Subtarget);
}
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Only vXi1 extract_subvectors need custom lowering");
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
if (!isa<ConstantSDNode>(Idx))
return SDValue();
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
MVT VecVT = Vec.getSimpleValueType();
unsigned NumElems = VecVT.getVectorNumElements();
// Extend to natively supported kshift.
MVT WideVecVT = VecVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
DAG.getUNDEF(WideVecVT), Vec,
DAG.getIntPtrConstant(0, dl));
}
// Shift to the LSB.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
}
// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(
const GlobalValue *GV, const unsigned char OpFlags) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
return X86ISD::WrapperRIP;
// GOTPCREL references must always use RIP.
if (OpFlags == X86II::MO_GOTPCREL)
return X86ISD::WrapperRIP;
return X86ISD::Wrapper;
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
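// For example, a ConstantPool reference becomes
// (X86ISD::Wrapper (TargetConstantPool ...)), plus an add of the PIC base
// register in 32-bit PIC mode.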
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
return Result;
}
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
SDLoc DL(JT);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
}
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
unsigned char OpFlags =
Subtarget.classifyBlockAddressReference();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
return Result;
}
/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
bool ForCall) const {
// Unpack the global address or external symbol.
const SDLoc &dl = SDLoc(Op);
const GlobalValue *GV = nullptr;
int64_t Offset = 0;
const char *ExternalSym = nullptr;
if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
GV = G->getGlobal();
Offset = G->getOffset();
} else {
const auto *ES = cast<ExternalSymbolSDNode>(Op);
ExternalSym = ES->getSymbol();
}
// Calculate some flags for address lowering.
const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
unsigned char OpFlags;
if (ForCall)
OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
else
OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
bool NeedsLoad = isGlobalStubReference(OpFlags);
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (GV) {
// Create a target global address if this is a global. If possible, fold the
// offset into the global address reference. Otherwise, ADD it on later.
int64_t GlobalOffset = 0;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
std::swap(GlobalOffset, Offset);
}
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
} else {
// If this is not a global address, this must be an external symbol.
Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
}
// If this is a direct call, avoid the wrapper if we don't need to do any
// loads or adds. This allows SDAG ISel to match direct calls.
if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
return Result;
Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (HasPICReg) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
if (NeedsLoad)
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
if (Offset != 0)
Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
DAG.getConstant(Offset, dl, PtrVT));
return Result;
}
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(),
OperandFlags);
X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
: X86ISD::TLSADDR;
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
} else {
SDValue Ops[] = { Chain, TGA };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
}
// TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
MFI.setAdjustsStack(true);
MFI.setHasCalls(true);
SDValue Flag = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
SDValue InFlag;
SDLoc dl(GA); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg,
SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
X86::RAX, X86II::MO_TLSGD);
}
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
const EVT PtrVT,
bool is64Bit) {
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
.getInfo<X86MachineFunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
SDValue Base;
if (is64Bit) {
Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
X86II::MO_TLSLDM, /*LocalDynamic=*/true);
}
// Note: the CleanupLocalDynamicTLSPass will remove redundant computations
// of Base.
// Build x@dtpoff.
unsigned char OperandFlags = X86II::MO_DTPOFF;
unsigned WrapperKind = X86ISD::Wrapper;
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
// Add x@dtpoff with the base.
return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
bool is64Bit, bool isPIC) {
SDLoc dl(GA);
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
is64Bit ? 257 : 256));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
MachinePointerInfo(Ptr));
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
// initialexec.
unsigned WrapperKind = X86ISD::Wrapper;
if (model == TLSModel::LocalExec) {
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
} else if (model == TLSModel::InitialExec) {
if (is64Bit) {
OperandFlags = X86II::MO_GOTTPOFF;
WrapperKind = X86ISD::WrapperRIP;
} else {
OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
}
} else {
llvm_unreachable("Unexpected model");
}
// emit "addl x@ntpoff,%eax" (local exec)
// or "addl x@indntpoff,%eax" (initial exec)
// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
SDValue TGA =
DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool PositionIndependent = isPositionIndependent();
if (Subtarget.isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget.is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
Subtarget.is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
PositionIndependent);
}
llvm_unreachable("Unknown TLS model.");
}
if (Subtarget.isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
X86ISD::WrapperRIP : X86ISD::Wrapper;
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
OpFlag = X86II::MO_TLVP;
SDLoc DL(Op);
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
// Lowering the machine isd will make sure everything is in the right
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
DAG.getIntPtrConstant(0, DL, true),
Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
if (Subtarget.isOSWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
// mov rcx, qword [rdx+rcx*8]
// mov eax, .tls$:tlsvar
// [rax+rcx] contains the address
// Windows 64bit: gs:0x58
// Windows 32bit: fs:__tls_array
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
// use its literal value of 0x2C.
Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
? Type::getInt8PtrTy(*DAG.getContext(),
256)
: Type::getInt32PtrTy(*DAG.getContext(),
257));
SDValue TlsArray = Subtarget.is64Bit()
? DAG.getIntPtrConstant(0x58, dl)
: (Subtarget.isTargetWindowsGNU()
? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
res = ThreadPointer;
} else {
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
if (Subtarget.is64Bit())
IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32);
else
IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
auto &DL = DAG.getDataLayout();
SDValue Scale =
DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of the start of the .tls section.
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), X86II::MO_SECREL);
SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
}
llvm_unreachable("TLS not implemented for this target.");
}
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
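/// For example, an i64 SRL_PARTS on a 32-bit target becomes an SHRD of the two
/// i32 halves plus a select that handles shift amounts of 32 or more.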
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
MVT VT = Op.getSimpleValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
// ISD::FSHL and ISD::FSHR have defined overflow behavior, but the ISD::SHL and
// ISD::SRA/SRL nodes don't. Insert an AND to be safe; it's optimized away
// during isel.
SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits - 1, dl, MVT::i8));
SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i8))
: DAG.getConstant(0, dl, VT);
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}
// If the shift amount is larger than or equal to the width of a part, we can't
// rely on the results of shld/shrd. Insert a test and select the appropriate
// values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i8));
SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
SDValue Hi, Lo;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
} else {
Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
}
return DAG.getMergeValues({ Lo, Hi }, dl);
}
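// Lower ISD::FSHL/FSHR. Scalar i16/i32/i64 funnel shifts map to SHLD/SHRD
// (with the i16 shift amount masked to 4 bits); vector types require the
// AVX512-VBMI2 VSHLD/VSHRD family.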
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
"Unexpected funnel shift opcode!");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
bool IsFSHR = Op.getOpcode() == ISD::FSHR;
if (VT.isVector()) {
assert(Subtarget.hasVBMI2() && "Expected VBMI2");
if (IsFSHR)
std::swap(Op0, Op1);
APInt APIntShiftAmt;
if (isConstantSplat(Amt, APIntShiftAmt)) {
uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
}
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
}
assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
if (IsFSHR)
std::swap(Op0, Op1);
// i16 needs its shift amount taken modulo 16, but i32/i64 have an implicit modulo.
if (VT == MVT::i16)
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
}
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
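// For example, sitofp i64 -> f64 on a 32-bit target is done by placing the
// i64 into a vector, converting with the packed AVX512DQ conversion, and
// extracting element 0.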
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Op.getOpcode() == ISD::SINT_TO_FP ||
Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
(VT != MVT::f32 && VT != MVT::f64))
return SDValue();
// Pack the i64 into a vector, do the operation, and extract.
// Use at least 256 bits to ensure the result is 128 bits for the f32 case.
unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
MVT VecVT = MVT::getVectorVT(VT, NumElts);
SDLoc dl(Op);
SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
DAG.getIntPtrConstant(0, dl));
}
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
const X86Subtarget &Subtarget) {
switch (Opcode) {
case ISD::SINT_TO_FP:
// TODO: Handle wider types with AVX/AVX512.
if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
return false;
// CVTDQ2PS or (V)CVTDQ2PD
return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
case ISD::UINT_TO_FP:
// TODO: Handle wider types and i64 elements.
if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
return false;
// VCVTUDQ2PS or VCVTUDQ2PD
return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
default:
return false;
}
}
/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
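/// For example, (sitofp (extractelement <4 x i32> %v, 0)) becomes an extract
/// of element 0 from (sitofp <4 x i32> %v to <4 x float>) when profitable.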
static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// TODO: This could be enhanced to handle smaller integer types by peeking
// through an extend.
SDValue Extract = Cast.getOperand(0);
MVT DestVT = Cast.getSimpleValueType();
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Extract.getOperand(1)))
return SDValue();
// See if we have a 128-bit vector cast op for this type of cast.
SDValue VecOp = Extract.getOperand(0);
MVT FromVT = VecOp.getSimpleValueType();
unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
return SDValue();
// If we are extracting from a non-zero element, first shuffle the source
// vector to allow extracting from element zero.
SDLoc DL(Cast);
if (!isNullConstant(Extract.getOperand(1))) {
SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
Mask[0] = Extract.getConstantOperandVal(1);
VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
}
// If the source vector is wider than 128-bits, extract the low part. Do not
// create an unnecessarily wide vector cast op.
if (FromVT != Vec128VT)
VecOp = extract128BitVector(VecOp, 0, DAG, DL);
// cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
// cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
return SDValue();
}
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
// These are really Legal; return the operand so the caller accepts it as
// Legal.
if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
return Op;
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit())
return Op;
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
SDValue ValueToStore = Op.getOperand(0);
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
!Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getStore(
DAG.getEntryNode(), dl, ValueToStore, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
SDLoc DL(Op);
SDVTList Tys;
bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
if (useSSE)
Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
else
Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
unsigned ByteSize = SrcVT.getSizeInBits() / 8;
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
MachineMemOperand *LoadMMO;
if (FI) {
int SSFI = FI->getIndex();
LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, ByteSize, ByteSize);
} else {
LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
StackSlot = StackSlot.getOperand(1);
}
SDValue FILDOps[] = {Chain, StackSlot};
SDValue Result =
DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
Tys, FILDOps, SrcVT, LoadMMO);
if (useSSE) {
Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// FIXME: Currently the FST is glued to the FILD_FLAG. This
// shouldn't be necessary except that RFP cannot be live across
// multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
unsigned SSFISize = Op.getValueSizeInBits() / 8;
int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOStore, SSFISize, SSFISize);
Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
Op.getValueType(), StoreMMO);
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
}
return Result;
}
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// This algorithm is not obvious. Here is what we're trying to output:
/*
movq %rax, %xmm0
punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
#ifdef __SSE3__
haddpd %xmm0, %xmm0
#else
pshufd $0x4e, %xmm0, %xmm1
addpd %xmm1, %xmm0
#endif
*/
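// Why this works: after the punpckldq, the two doubles in %xmm0 have the bit
// patterns 0x43300000'llllllll and 0x45300000'hhhhhhhh, where l/h are the low
// and high 32 bits of the input. Interpreted as doubles those are exactly
// 2^52 + lo and 2^84 + hi * 2^32. The subpd removes the 2^52 and 2^84 biases
// exactly, leaving {lo, hi * 2^32}, and the final horizontal add produces
// lo + hi * 2^32 with a single rounding.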
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
SmallVector<Constant*,2> CV1;
CV1.push_back(
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4330000000000000ULL))));
CV1.push_back(
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
Op.getOperand(0));
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
SDValue CLod1 =
DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
if (Subtarget.hasSSE3()) {
// FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
}
/// 32-bit unsigned integer to float expansion.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
// FP constant to bias correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
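// 0x4330000000000000 is the bit pattern of the double 2^52. OR'ing the
// zero-extended 32-bit input into its low mantissa bits (done below) yields
// the double 2^52 + x exactly, so subtracting the bias recovers x with no
// rounding.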
// Load the 32-bit value into an XMM register.
SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
Op.getOperand(0));
// Zero out the upper parts of the register.
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Load),
DAG.getIntPtrConstant(0, dl));
// Or the load with the bias.
SDValue Or = DAG.getNode(
ISD::OR, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
// Handle final rounding.
return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
}
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
SDValue N0 = Op.getOperand(0);
assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
// Legalize to v4i32 type.
N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
if (Subtarget.hasAVX512())
return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
// Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
// but using v2i32 to v2f64 with X86ISD::CVTSI2P.
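// Each unsigned 32-bit lane x is split as x = (x >> 16) * 2^16 + (x & 0xFFFF).
// Both halves are in [0, 65535], so the signed CVTSI2P conversions below are
// exact, and fHI * 2^16 + fLO reassembles x exactly in double precision.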
SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
// Two to the power of half-word-size.
SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
// Clear upper part of LO, lower HI.
SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
// Add the two halves.
return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// The algorithm is the following:
// #ifdef __SSE4_1__
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
// #else
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
// #endif
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
// We shouldn't use it when unsafe-fp-math is enabled though: we might later
// reassociate the two FADDs, and if we do that, the algorithm fails
// spectacularly (PR24512).
// FIXME: If we ever have some kind of Machine FMF, this should be marked
// as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
// there's also the MachineCombiner reassociations happening on Machine IR.
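// The magic numbers above, as float bit patterns: 0x4b000000 is 2^23 and
// 0x53000000 is 2^39. Merging the low 16 bits of a lane into the mantissa of
// 2^23 gives exactly 2^23 + lo, and merging the high 16 bits into 2^39 gives
// exactly 2^39 + hi * 2^16. The 0xD3000080 constant below is -(2^39 + 2^23),
// so fhi = hi * 2^16 - 2^23 exactly, and the final add computes
// (2^23 + lo) + (hi * 2^16 - 2^23) = v with a single rounding.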
if (DAG.getTarget().Options.UnsafeFPMath)
return SDValue();
SDLoc DL(Op);
SDValue V = Op->getOperand(0);
MVT VecIntVT = V.getSimpleValueType();
bool Is128 = VecIntVT == MVT::v4i32;
MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we convert to something other than the supported type, e.g., to v4f64,
// abort early.
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
"Unsupported custom type");
// In the #ifdef/#else code, we have in common:
// - The vector of constants:
// -- 0x4b000000
// -- 0x53000000
// - A shift:
// -- v >> 16
// Create the splat vector for 0x4b000000.
SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
// Create the splat vector for 0x53000000.
SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
// Create the right shift.
SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
SDValue Low, High;
if (Subtarget.hasSSE41()) {
MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
} else {
SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
}
// Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
SDValue VecCstFAdd = DAG.getConstantFP(
APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
SDValue FHigh =
DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
// return (float4) lo + fhi;
SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = Op.getOperand(0);
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
switch (SrcVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
case MVT::v2i32:
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
assert(!Subtarget.hasAVX512());
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
}
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
(SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
return Op;
}
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot, MachinePointerInfo());
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MachinePointerInfo());
SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
return Fild;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue ValueToStore = Op.getOperand(0);
if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
MachinePointerInfo());
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, 8, 8);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
APInt FF(32, 0x5F800000ULL);
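// 0x5F800000 is the single-precision bit pattern for 2^64. The FILD above
// interpreted the stored bits as a signed i64, so if the sign bit was set the
// loaded value is (input - 2^64); adding the 2^64 fudge (selected below when
// SignSet is true) restores the unsigned value. Since f80 has a 64-bit
// significand, the x87 add stays exact until the final FP_ROUND.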
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
SDValue FudgePtr = DAG.getConstantPool(
ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
// FIXME: Avoid the extend by constructing the right constant pool?
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
/* Alignment = */ 4);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
}
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence and return the
// result.
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned) const {
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
EVT TheVT = Op.getOperand(0).getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
return SDValue();
}
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
}
assert(DstTy.getSimpleVT() <= MVT::i64 &&
DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getStoreSize();
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
if (UnsignedFixup) {
//
// Conversion to unsigned i64 is implemented with a select,
// depending on whether the source value fits in the range
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
// Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
// FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
//
// Being a power of 2, Thresh is exactly representable in all FP formats.
// For X87 we'd like to use the smallest FP type for this constant, but
// for DAG type consistency we have to match the FP operand type.
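// 0x5f000000 is the single-precision bit pattern for 2^63, so Thresh below is
// exactly 2^63 in whichever FP format the source uses. Values >= 2^63 are
// biased down into signed i64 range before the FIST, and the XOR with the
// sign mask at the end of this function adds 2^63 back into the result.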
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
&LosesInfo);
else if (TheVT == MVT::f80)
Status = Thresh.convert(APFloat::x87DoubleExtended(),
APFloat::rmNearestTiesToEven, &LosesInfo);
assert(Status == APFloat::opOK && !LosesInfo &&
"FP conversion should have been exact");
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
SDValue Cmp = DAG.getSetCC(DL,
getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
DAG.getConstant(0, DL, MVT::i64),
DAG.getConstant(APInt::getSignMask(64),
DL, MVT::i64));
SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
}
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
// FIXME: This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the call stack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
SDValue Ops[] = { Chain, StackSlot };
unsigned FLDSize = TheVT.getStoreSize();
assert(FLDSize <= MemSize && "Stack slot not big enough");
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
}
// Build the FP_TO_INT*_IN_MEM
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOStore, MemSize, MemSize);
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
DAG.getVTList(MVT::Other),
Ops, DstTy, MMO);
SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
// If we need an unsigned fixup, XOR the result with adjust.
if (UnsignedFixup)
Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
return Res;
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
"Unexpected extension opcode");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::i64) &&
"Unexpected element type");
assert((InVT.getVectorElementType() == MVT::i8 ||
InVT.getVectorElementType() == MVT::i16 ||
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
return DAG.getNode(ExtendInVecOpc, dl, VT, In);
}
if (Subtarget.hasInt256())
return Op;
// Optimize vectors in AVX mode:
//
// v8i16 -> v8i32
// Use vpmovzxwd for 4 lower elements v8i16 -> v4i32.
// Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
// Concat upper and lower parts.
//
// v4i32 -> v4i64
// Use vpmovzxdq for 2 lower elements v4i32 -> v2i64.
// Use vpunpckhdq for 2 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
// Short-circuit if we can determine that each 128-bit half is the same value.
// Otherwise, this is difficult to match and optimize.
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
SDValue Undef = DAG.getUNDEF(InVT);
bool NeedZero = Opc == ISD::ZERO_EXTEND;
SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
OpHi = DAG.getBitcast(HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
const SDLoc &dl, SelectionDAG &DAG) {
assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
DAG.getIntPtrConstant(0, dl));
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
DAG.getIntPtrConstant(8, dl));
Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
// For all vectors but vXi8, we can just emit a sign_extend and a shift. This
// avoids a constant pool load.
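// (sign_extend of an i1 produces 0 or all-ones; a logical shift right by
// EltBits - 1 turns that into 0 or 1, which is the zero-extended value.)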
if (VT.getVectorElementType() != MVT::i8) {
SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
return DAG.getNode(ISD::SRL, DL, VT, Extend,
DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
}
// Extend VT if BWI is not supported.
MVT ExtVT = VT;
if (!Subtarget.hasBWI()) {
// If v16i32 is to be avoided, we'll need to split and concatenate.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
}
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);
In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
In, DAG.getIntPtrConstant(0, DL));
WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
NumElts);
}
SDValue One = DAG.getConstant(1, DL, WideVT);
SDValue Zero = DAG.getConstant(0, DL, WideVT);
SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
// Truncate if we had to extend above.
if (VT != ExtVT) {
WideVT = MVT::getVectorVT(MVT::i8, NumElts);
SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
}
// Extract back to 128/256-bit if we widened.
if (WideVT != VT)
SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
DAG.getIntPtrConstant(0, DL));
return SelectedVal;
}
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
if (SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
return LowerAVXExtend(Op, DAG, Subtarget);
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
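/// For example, a v8i32 -> v8i16 truncate packs the two 128-bit halves of the
/// source with a single PACKSSDW/PACKUSDW. For 512-bit sources on AVX2 the
/// per-lane PACK leaves the halves interleaved, so an extra 64-bit-chunk
/// shuffle ({0,2,1,3}) restores element order before any further packing.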
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode");
assert(DstVT.isVector() && "VT not a vector?");
// Requires SSE2 but AVX512 has fast vector truncate.
if (!Subtarget.hasSSE2())
return SDValue();
EVT SrcVT = In.getValueType();
// No truncation required, we might get here due to recursive calls.
if (SrcVT == DstVT)
return In;
// We only support vector truncation to 64bits or greater from a
// 128bits or greater source.
unsigned DstSizeInBits = DstVT.getSizeInBits();
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
return SDValue();
unsigned NumElems = SrcVT.getVectorNumElements();
if (!isPowerOf2_32(NumElems))
return SDValue();
LLVMContext &Ctx = *DAG.getContext();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
// Pack to the largest type possible:
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
EVT InVT = MVT::i16, OutVT = MVT::i8;
if (SrcVT.getScalarSizeInBits() > 16 &&
(Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
InVT = MVT::i32;
OutVT = MVT::i16;
}
// 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
if (SrcVT.is128BitVector()) {
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
In = DAG.getBitcast(InVT, In);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
Res = extractSubVector(Res, 0, DAG, DL, 64);
return DAG.getBitcast(DstVT, Res);
}
// Extract lower/upper subvectors.
unsigned NumSubElts = NumElems / 2;
SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
return DAG.getBitcast(DstVT, Res);
}
// AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
// AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
SmallVector<int, 64> Mask;
int Scale = 64 / OutVT.getScalarSizeInBits();
scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
return DAG.getBitcast(DstVT, Res);
// If 512bit -> 128bit truncate another stage.
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
Res = DAG.getBitcast(PackedVT, Res);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
// Recursively pack lower/upper subvectors, concat result and pack again.
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
// Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
if (InVT.getScalarSizeInBits() <= 16) {
if (Subtarget.hasBWI()) {
// legal, will go to VPMOVB2M, VPMOVW2M
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position.
// Shifting packed bytes is not supported natively, so bitcast to words.
MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
In = DAG.getNode(ISD::SHL, DL, ExtVT,
DAG.getBitcast(ExtVT, In),
DAG.getConstant(ShiftInx, DL, ExtVT));
In = DAG.getBitcast(InVT, In);
}
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
In, ISD::SETGT);
}
// Use TESTD/Q on the vector extended to packed dword/qword.
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();
assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
// We need to change to a wider element type that we have support for.
// For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
// For 16 element vectors we extend to v16i32 unless we are explicitly
// trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
// we need to split into two 8 element vectors which we can extend to v8i32,
// truncate and concat the results. There's an additional complication if
// the original type is v16i8. In that case we can't split the v16i8 so
// first we pre-extend it to v16i16 which we can split to v8i16, then extend
// to v8i32, truncate that to v8i1 and concat the two halves.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
if (InVT == MVT::v16i8) {
// First we need to sign extend up to 256-bits so we can split that.
InVT = MVT::v16i16;
In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
}
SDValue Lo = extract128BitVector(In, 0, DAG, DL);
SDValue Hi = extract128BitVector(In, 8, DAG, DL);
// We're split now, just emit two truncates and a concat. The two
// truncates will trigger legalization to come back to this function.
Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
// We either have 8 elements or we're allowed to use 512-bit vectors.
// If we have VLX, we want to use the narrowest vector that can get the
// job done so we use vXi32.
MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
ShiftInx = InVT.getScalarSizeInBits() - 1;
}
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position.
In = DAG.getNode(ISD::SHL, DL, InVT, In,
DAG.getConstant(ShiftInx, DL, InVT));
}
// If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
if (Subtarget.hasDQI())
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
unsigned InNumEltBits = InVT.getScalarSizeInBits();
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
// If called by the legalizer just return.
if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
return SDValue();
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
// word to byte only under BWI. Otherwise we have to promote to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
// handled by isel patterns.
if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
Subtarget.canExtendTo512DQ())
return Op;
}
unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero bits
// that extend all the way to the packed/truncated value.
// Pre-SSE41 we can only use PACKUSWB.
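// PACKUS saturates each element to the unsigned packed range, so it only acts
// as a plain truncate when the bits above the packed width are known zero;
// likewise PACKSS needs those bits to be copies of the sign bit.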
KnownBits Known = DAG.computeKnownBits(In);
if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
// Truncate with PACKSS if we are truncating a vector with sign-bits that
// extend all the way to the packed/truncated value.
if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
return V;
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(2, DL));
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
// The PSHUFB mask:
static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1,
16, 17, 20, 21, 24, 25, 28, 29,
-1, -1, -1, -1, -1, -1, -1, -1 };
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask2[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(4, DL));
OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
// The MOVLHPS Mask:
static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
return DAG.getBitcast(MVT::v8i16, res);
}
if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
// Use an AND to zero the upper bits for PACKUS.
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(0, DL));
SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(8, DL));
return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
}
// Handle truncation of V256 to V128 using shuffles.
assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
SmallVector<int, 16> MaskVec(NumElems * 2, -1);
// Prepare truncation shuffle mask
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
In = DAG.getBitcast(NVT, In);
SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
MVT VT = Op.getSimpleValueType();
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
// Widen to 512-bits.
ResVT = MVT::v8i32;
TruncVT = MVT::v8i1;
Opc = ISD::FP_TO_UINT;
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
DAG.getUNDEF(MVT::v8f64),
Src, DAG.getIntPtrConstant(0, dl));
}
SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
DAG.getIntPtrConstant(0, dl));
}
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
}
return SDValue();
}
assert(!VT.isVector());
bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
if (!IsSigned && Subtarget.hasAVX512()) {
// Conversions from f32/f64 should be legal.
if (UseSSEReg)
return Op;
// Use default expansion.
if (VT == MVT::i64)
return SDValue();
}
// Promote i16 to i32 if we can use a SSE operation.
if (VT == MVT::i16 && UseSSEReg) {
assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
// If this is an FP_TO_SINT using SSEReg, we're done.
if (UseSSEReg && IsSigned)
return Op;
// Fall back to X87.
if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned))
return V;
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
return DAG.getNode(X86ISD::VFPEXT, DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
In, DAG.getUNDEF(SVT)));
}
/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If both operands have other uses, this is probably not profitable.
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (!LHS.hasOneUse() && !RHS.hasOneUse())
return Op;
// FP horizontal add/sub were added with SSE3. Integer with SSSE3.
bool IsFP = Op.getSimpleValueType().isFloatingPoint();
if (IsFP && !Subtarget.hasSSE3())
return Op;
if (!IsFP && !Subtarget.hasSSSE3())
return Op;
// Extract from a common vector.
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
LHS.getOperand(0) != RHS.getOperand(0) ||
!isa<ConstantSDNode>(LHS.getOperand(1)) ||
!isa<ConstantSDNode>(RHS.getOperand(1)) ||
!shouldUseHorizontalOp(true, DAG, Subtarget))
return Op;
// Allow commuted 'hadd' ops.
// TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
unsigned HOpcode;
switch (Op.getOpcode()) {
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default:
llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
}
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
return Op;
SDValue X = LHS.getOperand(0);
EVT VecVT = X.getValueType();
unsigned BitWidth = VecVT.getSizeInBits();
unsigned NumLanes = BitWidth / 128;
unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here");
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
// equivalent, so extract the 256/512-bit source op to 128-bit if we can.
SDLoc DL(Op);
if (BitWidth == 256 || BitWidth == 512) {
unsigned LaneIdx = LExtIndex / NumEltsPerLane;
X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
LExtIndex %= NumEltsPerLane;
}
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
"Wrong opcode for lowering FABS or FNEG.");
bool IsFABS = (Op.getOpcode() == ISD::FABS);
// If this is a FABS and it has an FNEG user, bail out to fold the combination
// into an FNABS. We'll lower the FABS after that if it is still in use.
if (IsFABS)
for (SDNode *User : Op->uses())
if (User->getOpcode() == ISD::FNEG)
return Op;
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFABSorFNEG");
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
// Using a 16-byte mask allows folding the load of the mask with
// the logic op, so it can save (~4 bytes) on code size.
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
unsigned EltBits = VT.getScalarSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
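// i.e. FAND with 0x7f... clears the sign bit (FABS), FXOR with 0x80... flips
// it (FNEG), and FOR sets it (the FNABS case for a folded FNEG(FABS(x))).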
APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
APInt::getSignMask(EltBits);
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
unsigned LogicOp = IsFABS ? X86ISD::FAND :
IsFNABS ? X86ISD::FOR :
X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
if (VT.isVector() || IsF128)
return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
// For the scalar case extend to a 128-bit vector, perform the logic op,
// and extract the scalar result back out.
Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Mag = Op.getOperand(0);
SDValue Sign = Op.getOperand(1);
SDLoc dl(Op);
// If the sign operand is smaller, extend it first.
MVT VT = Op.getSimpleValueType();
if (Sign.getSimpleValueType().bitsLT(VT))
Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
// And if it is bigger, shrink it first.
if (Sign.getSimpleValueType().bitsGT(VT))
Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFCOPYSIGN");
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
// Perform all scalar logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE.
// TODO: This isn't necessary. If we used scalar types, we might avoid some
// unnecessary splats, but we might miss load folding opportunities. Should
// this decision be based on OptimizeForSize?
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
// The mask constants are automatically splatted for vector types.
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue SignMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
SDValue MagMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
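// Conceptually, the lowering below computes
// copysign(Mag, Sign) = (Sign & SignMask) | (Mag & MagMask),
// where MagMask is the complement of the sign bit.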
// First, clear all bits but the sign bit from the second operand (sign).
if (IsFakeVector)
Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
// Next, clear the sign bit from the first operand (magnitude).
// TODO: If we had general constant folding for FP logic ops, this check
// wouldn't be necessary.
SDValue MagBits;
if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
APFloat APF = Op0CN->getValueAPF();
APF.clearSign();
MagBits = DAG.getConstantFP(APF, dl, LogicVT);
} else {
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (IsFakeVector)
Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
}
// OR the magnitude value with the sign bit.
SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT OpVT = N0.getSimpleValueType();
assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
"Unexpected type for FGETSIGN");
// Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
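// MOVMSK gathers the per-lane sign bits into the low bits of a GPR, so for
// the scalar in lane 0 the sign is bit 0 of the MOVMSK result.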
MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
Res = DAG.getZExtOrTrunc(Res, dl, VT);
Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
return Res;
}
/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
SelectionDAG &DAG) {
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
/// style scalarized (associative) reduction patterns.
static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp,
SmallVectorImpl<SDValue> &SrcOps) {
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
// Recognize a special case where a vector is cast into a wide integer to
// test all 0s.
assert(Op.getOpcode() == unsigned(BinOp) &&
"Unexpected bit reduction opcode");
Opnds.push_back(Op.getOperand(0));
Opnds.push_back(Op.getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all BinOp operands.
if (I->getOpcode() == unsigned(BinOp)) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
e += 2; // 2 more nodes (LHS and RHS) are pushed.
continue;
}
// Quit if this is not an EXTRACT_VECTOR_ELT.
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
// Quit if the index is not a constant.
SDValue Idx = I->getOperand(1);
if (!isa<ConstantSDNode>(Idx))
return false;
SDValue Src = I->getOperand(0);
DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
if (M == SrcOpMap.end()) {
VT = Src.getValueType();
// Quit if not the same type.
if (SrcOpMap.begin() != SrcOpMap.end() &&
VT != SrcOpMap.begin()->first.getValueType())
return false;
unsigned NumElts = VT.getVectorNumElements();
APInt EltCount = APInt::getNullValue(NumElts);
M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
SrcOps.push_back(Src);
}
// Quit if element already used.
unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
if (M->second[CIdx])
return false;
M->second.setBit(CIdx);
}
// Quit if not all elements are used.
for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
E = SrcOpMap.end();
I != E; ++I) {
if (!I->second.isAllOnesValue())
return false;
}
return true;
}
// Check whether an OR'd tree is PTEST-able.
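// An OR-reduction of extracted elements compared against zero is equivalent
// to testing whether the whole source vector is zero. PTEST V,V sets ZF
// exactly when V is all zeros, so the tree collapses into one PTEST plus a
// SETE/SETNE on ZF.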
static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &X86CC) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
if (!Subtarget.hasSSE41() || !Op->hasOneUse())
return SDValue();
SmallVector<SDValue, 8> VecIns;
if (!matchBitOpReduction(Op, ISD::OR, VecIns))
return SDValue();
// Quit if not 128/256-bit vector.
EVT VT = VecIns[0].getValueType();
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
SDLoc DL(Op);
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
// Cast all vectors into TestVT for PTEST.
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
// If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is only
// 1 node left, i.e. the final OR'd value of all vectors.
SDValue LHS = VecIns[Slot];
SDValue RHS = VecIns[Slot + 1];
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL,
MVT::i8);
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// Return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
++UI) {
SDNode *User = *UI;
unsigned UOpNo = UI.getOperandNo();
if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
// Look past the truncate.
UOpNo = User->use_begin().getOperandNo();
User = *User->use_begin();
}
if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
!(User->getOpcode() == ISD::SELECT && UOpNo == 0))
return true;
}
return false;
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
bool NeedOF = false;
switch (X86CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
NeedCF = true;
break;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
case X86::COND_O: case X86::COND_NO: {
// Check if we really need to set the Overflow flag. If NoSignedWrap is
// present, it is not actually needed.
switch (Op->getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::SHL:
if (Op.getNode()->getFlags().hasNoSignedWrap())
break;
LLVM_FALLTHROUGH;
default:
NeedOF = true;
break;
}
break;
}
}
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
unsigned Opcode = 0;
unsigned NumOperands = 0;
SDValue ArithOp = Op;
// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
// which may be the result of a CAST. We use the variable 'Op', which is the
// non-casted variable when we check for possible users.
switch (ArithOp.getOpcode()) {
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
// because a TEST instruction will be better.
if (!hasNonFlagsUse(Op))
break;
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
// Transform to an x86-specific ALU node with flags if there is a chance of
// using an RMW op or only the flags are used. Otherwise, leave
// the node alone and emit a 'test' instruction.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
UE = Op.getNode()->use_end(); UI != UE; ++UI)
if (UI->getOpcode() != ISD::CopyToReg &&
UI->getOpcode() != ISD::SETCC &&
UI->getOpcode() != ISD::STORE)
goto default_case;
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::ADD: Opcode = X86ISD::ADD; break;
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: Opcode = X86ISD::OR; break;
}
NumOperands = 2;
break;
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
return SDValue(Op.getNode(), 1);
default:
default_case:
break;
}
if (Opcode == 0) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
const SDLoc &dl, SelectionDAG &DAG) const {
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
EVT CmpVT = Op0.getValueType();
if (CmpVT.isFloatingPoint())
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
// Only promote the compare up to i32 if it is a 16-bit operation
// with an immediate; 16-bit immediates are to be avoided.
if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
!DAG.getMachineFunction().getFunction().hasMinSize()) {
ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
// Don't do this if the immediate can fit in 8-bits.
if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
(COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncated from something with enough sign bits.
if (Op0.getOpcode() == ISD::TRUNCATE) {
SDValue In = Op0.getOperand(0);
unsigned EffBits =
In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
if (EffBits <= 16)
ExtendOp = ISD::SIGN_EXTEND;
} else if (Op1.getOpcode() == ISD::TRUNCATE) {
SDValue In = Op1.getOperand(0);
unsigned EffBits =
In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
if (EffBits <= 16)
ExtendOp = ISD::SIGN_EXTEND;
}
}
CmpVT = MVT::i32;
Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
}
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
return Sub.getValue(1);
}
/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SelectionDAG &DAG) const {
// If the subtarget does not support the FUCOMI instruction, floating-point
// comparisons have to be converted.
if (Subtarget.hasCMov() ||
Cmp.getOpcode() != X86ISD::CMP ||
!Cmp.getOperand(0).getValueType().isFloatingPoint() ||
!Cmp.getOperand(1).getValueType().isFloatingPoint())
return Cmp;
// The instruction selector will select an FUCOM instruction instead of
// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
// build an SDNode sequence that transfers the result from FPSW into EFLAGS:
// (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
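// In machine code this is roughly: fucom(p) ; fnstsw %ax ; sahf. SAHF copies
// AH (which holds FPSW bits 8-15, i.e. C0/C2/C3) into EFLAGS, so the usual
// SETcc/Jcc patterns can then be used on the result.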
SDLoc dl(Cmp);
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
DAG.getConstant(8, dl, MVT::i8));
SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
// Some 64-bit targets lack SAHF support, but they do support FCOMI.
assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// We never want to use both SQRT and RSQRT instructions for the same input.
if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
return false;
if (VT.isVector())
return Subtarget.hasFastVectorFSQRT();
return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
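/// One Newton-Raphson step for 1/sqrt(a) is x1 = x0 * (1.5 - 0.5 * a * x0 * x0),
/// where x0 is the hardware RSQRT estimate.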
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
// TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
// after legalize types.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
UseOneConstNR = false;
// There is no FSQRT for 512-bits, but there is RSQRT14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
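/// One Newton-Raphson step for 1/a is x1 = x0 * (2.0 - a * x0), where x0 is the
/// hardware RCP estimate.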
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
// Enable estimate codegen with 1 refinement step for vector division.
// Scalar division estimates are disabled because they break too much
// real-world code. These defaults are intended to match GCC behavior.
if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
return SDValue();
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
// There is no FRCP for 512-bits, but there is RCP14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
SDValue &X86CC) {
assert(And.getOpcode() == ISD::AND && "Expected AND node!");
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
if (Op1.getOpcode() == ISD::TRUNCATE)
Op1 = Op1.getOperand(0);
SDValue Src, BitNo;
if (Op1.getOpcode() == ISD::SHL)
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
if (isOneConstant(Op0.getOperand(0))) {
// If we looked past a truncate, check that it's only truncating away
// known zeros.
unsigned BitWidth = Op0.getValueSizeInBits();
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
KnownBits Known = DAG.computeKnownBits(Op0);
if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
Src = Op1;
BitNo = Op0.getOperand(1);
}
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
uint64_t AndRHSVal = AndRHS->getZExtValue();
SDValue AndLHS = Op0;
if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
Src = AndLHS.getOperand(0);
BitNo = AndLHS.getOperand(1);
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
// are optimizing for size and the immediate won't fit in a byte.
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
Src.getValueType());
}
}
}
// No patterns found, give up.
if (!Src.getNode())
return SDValue();
// If Src is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
// the encoding for the i16 version is larger than the i32 version.
// Also promote i16 to i32 for performance / code size reasons.
if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
// See if we can use the 32-bit instruction instead of the 64-bit one for a
// shorter encoding. Since the former takes the modulo 32 of BitNo and the
// latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
// known to be zero.
if (Src.getValueType() == MVT::i64 &&
DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
// If the operand types disagree, extend the shift amount to match. Since
// BT ignores high bits (like shifts) we can use anyextend.
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
dl, MVT::i8);
return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
SDValue &Op1) {
unsigned SSECC;
bool Swap = false;
// SSE Condition code mapping:
// 0 - EQ
// 1 - LT
// 2 - LE
// 3 - UNORD
// 4 - NEQ
// 5 - NLT
// 6 - NLE
// 7 - ORD
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETOEQ:
case ISD::SETEQ: SSECC = 0; break;
case ISD::SETOGT:
case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLT:
case ISD::SETOLT: SSECC = 1; break;
case ISD::SETOGE:
case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLE:
case ISD::SETOLE: SSECC = 2; break;
case ISD::SETUO: SSECC = 3; break;
case ISD::SETUNE:
case ISD::SETNE: SSECC = 4; break;
case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: SSECC = 5; break;
case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
case ISD::SETUEQ: SSECC = 8; break;
case ISD::SETONE: SSECC = 12; break;
}
if (Swap)
std::swap(Op0, Op1);
return SSECC;
}
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(VT.getVectorElementType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
// If this is a seteq make sure any build vectors of all zeros are on the RHS.
// This helps with vptestm matching.
// TODO: Should we just canonicalize the setcc during DAG combine?
if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
ISD::isBuildVectorAllZeros(Op0.getNode()))
std::swap(Op0, Op1);
// Prefer SETGT over SETLT.
if (SetCCOpcode == ISD::SETLT) {
SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
std::swap(Op0, Op1);
}
return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
/// Given a buildvector constant, return a new vector constant with each element
/// incremented or decremented. If incrementing or decrementing would result in
/// unsigned overflow or underflow or this is not a simple vector constant,
/// return an empty value.
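/// For example, <i16 1, 2, 3, 4> incremented becomes <i16 2, 3, 4, 5>; a vector
/// containing the maximum value (when incrementing) or zero (when decrementing)
/// returns an empty value.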
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
if (!BV)
return SDValue();
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 8> NewVecC;
SDLoc DL(V);
for (unsigned i = 0; i < NumElts; ++i) {
auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
return SDValue();
// Avoid overflow/underflow.
const APInt &EltC = Elt->getAPIntValue();
if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
return SDValue();
NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
}
return DAG.getBuildVector(VT, DL, NewVecC);
}
/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:
/// t = psubus Op0, Op1
/// pcmpeq t, <0..0>
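/// The saturating subtract is zero exactly when Op0 u<= Op1 element-wise, so
/// comparing the result against zero yields the SETULE mask directly.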
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
ISD::CondCode Cond, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (!Subtarget.hasSSE2())
return SDValue();
MVT VET = VT.getVectorElementType();
if (VET != MVT::i8 && VET != MVT::i16)
return SDValue();
switch (Cond) {
default:
return SDValue();
case ISD::SETULT: {
// If the comparison is against a constant, we can turn this into a
// setule. With psubus, setule does not require a swap. This is beneficial
// because the constant in the register is no longer clobbered as the
// destination, so it can be hoisted out of a loop.
// Only do this pre-AVX, since vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
break;
}
case ISD::SETUGT: {
// If the comparison is against a constant, we can turn this into a setuge.
// This is beneficial because materializing a constant 0 for the PCMPEQ is
// probably cheaper than XOR+PCMPGT using 2 different vector constants:
// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
if (!UGEOp1)
return SDValue();
Op1 = Op0;
Op0 = UGEOp1;
break;
}
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE:
std::swap(Op0, Op1);
break;
case ISD::SETULE:
break;
}
SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
DAG.getConstant(0, dl, VT));
}
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
Opc = X86ISD::CMPM;
} else {
Opc = X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
}
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
// emit two comparisons and a logic op to tie them together.
SDValue Cmp;
unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
if (SSECC >= 8 && !Subtarget.hasAVX()) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = X86ISD::FOR;
} else {
assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = X86ISD::FAND;
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC1, dl, MVT::i8));
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
// Handle all other FP comparisons here.
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
// result type of SETCC. The bitcast is expected to be optimized away
// during combining/isel.
if (Opc == X86ISD::CMPP)
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
return Cmp;
}
MVT VTOp0 = Op0.getSimpleValueType();
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
// This is being called by type legalization because v2i32 is marked custom
// for result type legalization for v2f32.
if (VTOp0 == MVT::v2i32)
return SDValue();
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In the AVX-512 architecture, setcc returns a mask with i1 elements,
// but there is no compare instruction for i8 and i16 elements in KNL.
assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
"Unexpected operand type");
return LowerIntVSETCC_AVX512(Op, DAG);
}
// Lower using XOP integer comparisons.
if (VT.is128BitVector() && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
case ISD::SETULE:
case ISD::SETLE: CmpMode = 0x01; break;
case ISD::SETUGT:
case ISD::SETGT: CmpMode = 0x02; break;
case ISD::SETUGE:
case ISD::SETGE: CmpMode = 0x03; break;
case ISD::SETEQ: CmpMode = 0x04; break;
case ISD::SETNE: CmpMode = 0x05; break;
}
// Are we comparing unsigned or signed integers?
unsigned Opc =
ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CmpMode, dl, MVT::i8));
}
// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
SDValue BC0 = peekThroughBitcasts(Op0);
if (BC0.getOpcode() == ISD::AND) {
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (getTargetConstantBitsFromNode(BC0.getOperand(1),
VT.getScalarSizeInBits(), UndefElts,
EltBits, false, false)) {
if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
Cond = ISD::SETEQ;
Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
}
}
}
}
// ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
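// For example, with i32 elements and C = 8 (bit 3), (x & 8) == 8 becomes
// sra(shl(x, 28), 31): the left shift moves bit 3 into the sign bit and the
// arithmetic right shift broadcasts it across the whole lane.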
if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
ConstantSDNode *C1 = isConstOrConstSplat(Op1);
if (C1 && C1->getAPIntValue().isPowerOf2()) {
unsigned BitWidth = VT.getScalarSizeInBits();
unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
SDValue Result = Op0.getOperand(0);
Result = DAG.getNode(ISD::SHL, dl, VT, Result,
DAG.getConstant(ShiftAmt, dl, VT));
Result = DAG.getNode(ISD::SRA, dl, VT, Result,
DAG.getConstant(BitWidth - 1, dl, VT));
return Result;
}
}
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntVSETCC(Op, DAG);
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT,
// which will be swapped to SETGT.
// Otherwise we use PCMPEQ+invert.
APInt ConstValue;
if (Cond == ISD::SETNE &&
ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
if (ConstValue.isMinSignedValue())
Cond = ISD::SETGT;
else if (ConstValue.isMaxSignedValue())
Cond = ISD::SETLT;
}
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
// TODO: We could check for more general simplifications here since we're
// computing known bits.
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
// Special case: Use min/max operations for unsigned compares.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ISD::isUnsignedIntSetCC(Cond) &&
(FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
if (Cond == ISD::SETUGT &&
ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
return !C->getAPIntValue().isMaxValue();
})) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETUGE;
}
if (Cond == ISD::SETULT &&
ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
return !C->getAPIntValue().isNullValue();
})) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETULE;
}
bool Invert = false;
unsigned Opc;
switch (Cond) {
default: llvm_unreachable("Unexpected condition code");
case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETULE: Opc = ISD::UMIN; break;
case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: Opc = ISD::UMAX; break;
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
// Try to use SUBUS and PCMPEQ.
if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
return V;
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integers, swapping operands and multiple
// operations may be required for some comparisons.
unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
: X86ISD::PCMPGT;
bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
Cond == ISD::SETGE || Cond == ISD::SETUGE;
bool Invert = Cond == ISD::SETNE ||
(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
SDValue SB;
if (FlipSigns) {
SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
} else {
SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
}
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
// Cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
// Create masks for only the low parts/high parts of the 64 bit integers.
static const int MaskHi[] = { 1, 1, 3, 3 };
static const int MaskLo[] = { 0, 0, 2, 2 };
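// MaskHi/MaskLo broadcast each 32-bit half-result across both dwords of its
// 64-bit lane, so the AND/OR below produce a full all-ones/all-zeros mask per
// 64-bit element.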
SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Do the compare.
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
static const int Mask[] = { 1, 0, 3, 2 };
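// The {1, 0, 3, 2} shuffle swaps the two dwords within each qword, so the AND
// below is all-ones in a 64-bit lane only if both 32-bit halves compared equal.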
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
}
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
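// Flipping the sign bit maps unsigned order onto signed order:
// x u< y iff (x ^ SignMask) s< (y ^ SignMask).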
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
// Try to select this as a KORTEST+SETCC if possible.
static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue &X86CC) {
// Only support equality comparisons.
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return SDValue();
// Must be a bitcast from vXi1.
if (Op0.getOpcode() != ISD::BITCAST)
return SDValue();
Op0 = Op0.getOperand(0);
MVT VT = Op0.getSimpleValueType();
if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
!(Subtarget.hasDQI() && VT == MVT::v8i1) &&
!(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
return SDValue();
X86::CondCode X86Cond;
if (isNullConstant(Op1)) {
X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
} else if (isAllOnesConstant(Op1)) {
// KORTEST sets the carry flag when the result is all ones.
X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
} else
return SDValue();
// If the input is an OR, we can combine its operands into the KORTEST.
SDValue LHS = Op0;
SDValue RHS = Op0;
if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
LHS = Op0.getOperand(0);
RHS = Op0.getOperand(1);
}
X86CC = DAG.getConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
}
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
SelectionDAG &DAG,
SDValue &X86CC) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
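// BT copies the selected bit into CF, so '== 0' uses COND_AE (CF clear) and
// '!= 0' uses COND_B (CF set); see LowerAndToBT above.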
if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
return BT;
}
// Try to use PTEST for a tree of ORs equality-compared with 0.
// TODO: We could handle an AND tree with all 1s as well by using the C flag.
if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
return PTEST;
}
// Try to lower using KORTEST.
if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
return KORTEST;
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// If the input is a setcc, then reuse the input setcc or use a new one with
// the inverted condition.
if (Op0.getOpcode() == X86ISD::SETCC) {
bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
X86CC = Op0.getOperand(0);
if (Invert) {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
X86CC = DAG.getConstant(CCode, dl, MVT::i8);
}
return Op0.getOperand(1);
}
}
bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
if (CondCode == X86::COND_INVALID)
return SDValue();
SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
X86CC = DAG.getConstant(CondCode, dl, MVT::i8);
return EFLAGS;
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue X86CC;
SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
if (!EFLAGS)
return SDValue();
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
// Recreate the carry if needed.
EVT CarryVT = Carry.getValueType();
APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getConstant(NegOne, DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
return getSETCC(CC, Cmp.getValue(1), DL, DAG);
}
// This function returns three things: the arithmetic computation itself
// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
// flag and the condition code define the case in which the arithmetic
// computation overflows.
static std::pair<SDValue, SDValue>
getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
assert(Op.getResNo() == 0 && "Unexpected result number!");
SDValue Value, Overflow;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
unsigned BaseOp = 0;
SDLoc DL(Op);
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
break;
case ISD::UADDO:
BaseOp = X86ISD::ADD;
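// An unsigned add of 1 overflows exactly when the result wraps to zero, so ZF
// (COND_E) can be used instead of CF; presumably this also lets isel pick INC,
// which leaves CF unchanged.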
Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
break;
case ISD::SSUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
break;
case ISD::USUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_B;
break;
case ISD::SMULO:
BaseOp = X86ISD::SMUL;
Cond = X86::COND_O;
break;
case ISD::UMULO:
BaseOp = X86ISD::UMUL;
Cond = X86::COND_O;
break;
}
if (BaseOp) {
// Also sets EFLAGS.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
Overflow = Value.getValue(1);
}
return std::make_pair(Value, Overflow);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
// looks for this combo and may remove the "setcc" instruction if the "setcc"
// has only one use.
SDLoc DL(Op);
X86::CondCode Cond;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
}
/// Return true if the opcode is an X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
Opc == X86ISD::SAHF)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
return true;
return false;
}
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
return false;
SDValue VOp0 = V.getOperand(0);
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool AddTest = true;
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
MVT VT = Op1.getSimpleValueType();
SDValue CC;
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
((Subtarget.hasSSE2() && VT == MVT::f64) ||
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
unsigned SSECC = translateX86FSETCC(
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
assert(!VT.isVector() && "Not a scalar type?");
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (SSECC < 8 || Subtarget.hasAVX()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getConstant(SSECC, DL, MVT::i8));
// If we have AVX, we can use a variable vector select (VBLENDV) instead
// of 3 logic instructions for size savings and potentially speed.
// Unfortunately, there is no scalar form of VBLENDV.
// If either operand is a +0.0 constant, don't try this. We can expect to
// optimize away at least one of the logic instructions later in that
// case, so that sequence would be faster than a variable blend.
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
!isNullFPConstant(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
}
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
}
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
// For v64i1 without 64-bit support we need to split and rejoin.
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
assert(Subtarget.hasBWI() && "Expected BWI to be legal");
SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
Op1Scalar = Op1.getOperand(0);
SDValue Op2Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
DAG.getIntPtrConstant(0, DL));
}
}
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
// If the condition was updated, it's possible that the operands of the
// select were also updated (for example, EmitTest has a RAUW). Refresh
// the local references to the select operands in case they got stale.
Op1 = Op.getOperand(1);
Op2 = Op.getOperand(2);
}
}
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
// (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
unsigned CondCode =
cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
SDValue CmpOp0 = Cmp.getOperand(0);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
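// CMP(0, x) computes 0 - x and sets CF exactly when x != 0; SBB of zero with
// itself then produces 0 - CF, i.e. -1 when x != 0 and 0 otherwise.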
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Zero = DAG.getConstant(0, DL, Op.getValueType());
return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
}
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
SDValue CmpOp0 = Cmp.getOperand(0);
SDValue Src1, Src2;
// True if Op2 is an XOR or OR operator and one of its operands
// equals Op1, i.e. the pattern is (a, a op b) or (b, a op b).
auto isOrXorPattern = [&]() {
if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
(Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
Src1 =
Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
Src2 = Op1;
return true;
}
return false;
};
if (isOrXorPattern()) {
SDValue Neg;
unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
// We need a mask of all zeros or all ones with the same size as the other
// operands.
if (CmpSz > VT.getSizeInBits())
Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
else if (CmpSz < VT.getSizeInBits())
Neg = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
DAG.getConstant(1, DL, VT));
else
Neg = CmpOp0;
SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Neg); // -(and (x, 0x1))
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
}
}
}
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
MVT VT = Op.getSimpleValueType();
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!isScalarFPTypeInSSEReg(VT)) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
Cmp.getOpcode() == X86ISD::BT) { // FIXME
Cond = Cmp;
AddTest = false;
}
} else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
SDValue Value;
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
CC = DAG.getConstant(X86Cond, DL, MVT::i8);
AddTest = false;
}
if (AddTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
SDValue BTCC;
if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
CC = BTCC;
Cond = BT;
AddTest = false;
}
}
}
if (AddTest) {
CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
X86::COND_NE, DL, DAG);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(isNullConstant(Op1) || isNullConstant(Op2))) {
SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getConstant(X86::COND_B, DL, MVT::i8),
Cond);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
}
}
// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
// widen the cmov and push the truncate through. This avoids introducing a new
// branch during isel and doesn't add any extensions.
if (Op.getValueType() == MVT::i8 &&
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
// Blacklist CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
CC, Cond);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
}
// Or finally, promote i8 cmovs if we have CMOV,
// or i16 cmovs if it won't prevent folding a load.
// FIXME: we should not limit promotion of the i8 case to only when the CMOV is
// legal, but EmitLoweredSelect() cannot deal with these extensions
// being inserted between two CMOVs (this applies to the i16 case too).
// https://bugs.llvm.org/show_bug.cgi?id=40974
if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
(Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
!MayFoldLoad(Op2))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDValue Ops[] = { Op2, Op1, CC, Cond };
return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
MVT VTElt = VT.getVectorElementType();
SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
// Extend VT if the scalar type is i8/i16 and BWI is not supported.
MVT ExtVT = VT;
if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
// If v16i32 is to be avoided, we'll need to split and concatenate.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
}
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);
In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
In, DAG.getIntPtrConstant(0, dl));
WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
}
SDValue V;
MVT WideEltVT = WideVT.getVectorElementType();
if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
(Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
} else {
SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
SDValue Zero = DAG.getConstant(0, dl, WideVT);
V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
}
// Truncate if we had to extend i16/i8 above.
if (VT != ExtVT) {
WideVT = MVT::getVectorVT(VTElt, NumElts);
V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
}
// Extract back to 128/256-bit if we widened.
if (WideVT != VT)
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
DAG.getIntPtrConstant(0, dl));
return V;
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
return LowerAVXExtend(Op, DAG, Subtarget);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
!(VT.is256BitVector() && Subtarget.hasAVX()) &&
!(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
// For 256-bit vectors, we only need the lower (128-bit) half of the input.
// For 512-bit vectors, we need 128-bits or 256-bits.
if (InVT.getSizeInBits() > 128) {
// Input needs to be at least the same number of elements as output, and
// at least 128-bits.
int InSize = InSVT.getSizeInBits() * NumElts;
In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
InVT = In.getSimpleValueType();
}
// SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
// results, so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
// instructions still need to be handled here for 256/512-bit results.
if (Subtarget.hasInt256()) {
assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
if (InVT.getVectorNumElements() != NumElts)
return DAG.getNode(Op.getOpcode(), dl, VT, In);
// FIXME: Apparently we create inreg operations that could be regular
// extends.
unsigned ExtOpc =
Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, dl, VT, In);
}
// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
if (Subtarget.hasAVX()) {
assert(VT.is256BitVector() && "256-bit vector expected");
int HalfNumElts = NumElts / 2;
MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
unsigned NumSrcElts = InVT.getVectorNumElements();
SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
for (int i = 0; i != HalfNumElts; ++i)
HiMask[i] = HalfNumElts + i;
SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
// We should only get here for sign extend.
assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
SDValue Curr = In;
SDValue SignExt = Curr;
// As SRAI is only available on i16/i32 types, we expand only up to i32
// and handle i64 separately.
if (InVT != MVT::v4i32) {
MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
unsigned DestWidth = DestVT.getScalarSizeInBits();
unsigned Scale = DestWidth / InSVT.getSizeInBits();
unsigned InNumElts = InVT.getVectorNumElements();
unsigned DestElts = DestVT.getVectorNumElements();
// Build a shuffle mask that takes each input element and places it in the
// MSBs of the new element size.
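// For example, v16i8 -> v4i32 (Scale = 4): input element i is placed at
// position i*4+3, the most significant byte of dword i, and the VSRAI below
// shifts right by 24 to replicate its sign bit.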
SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
for (unsigned i = 0; i != DestElts; ++i)
Mask[i * Scale + (Scale - 1)] = i;
Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
Curr = DAG.getBitcast(DestVT, Curr);
unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
DAG.getConstant(SignExtShift, dl, MVT::i8));
}
if (VT == MVT::v2i64) {
assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
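// (0 s> Curr) yields an all-ones dword wherever Curr is negative; the
// {0, 4, 1, 5} shuffle below pairs each low dword with its sign word to form
// the sign-extended 64-bit elements.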
SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
SignExt = DAG.getBitcast(VT, SignExt);
}
return SignExt;
}
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::i64) &&
"Unexpected element type");
assert((InVT.getVectorElementType() == MVT::i8 ||
InVT.getVectorElementType() == MVT::i16 ||
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
}
if (Subtarget.hasInt256())
return Op;
// Optimize vectors in AVX mode:
// sign extend v8i16 to v8i32 and v4i32 to v4i64.
//
// Divide the input vector into two parts;
// for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
// Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
// then concat the vectors back to the original VT.
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
unsigned NumElems = InVT.getVectorNumElements();
SmallVector<int,8> ShufMask(NumElems, -1);
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask[i] = i + NumElems/2;
SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
/// Change a vector store into a pair of half-size vector stores.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
SDValue StoredVal = Store->getValue();
assert((StoredVal.getValueType().is256BitVector() ||
StoredVal.getValueType().is512BitVector()) &&
"Expecting 256/512-bit op");
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
if (Store->isVolatile())
return SDValue();
MVT StoreVT = StoredVal.getSimpleValueType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
SDLoc DL(Store);
SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
SDValue Ptr0 = Store->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
unsigned Alignment = Store->getAlignment();
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
Alignment, Store->getMemOperand()->getFlags());
SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
Store->getPointerInfo().getWithOffset(HalfAlign),
MinAlign(Alignment, HalfAlign),
Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
}
/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
/// type.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
SelectionDAG &DAG) {
SDValue StoredVal = Store->getValue();
assert(StoreVT.is128BitVector() &&
StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
StoredVal = DAG.getBitcast(StoreVT, StoredVal);
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
if (Store->isVolatile())
return SDValue();
MVT StoreSVT = StoreVT.getScalarType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned ScalarSize = StoreSVT.getStoreSize();
unsigned Alignment = Store->getAlignment();
SDLoc DL(Store);
SmallVector<SDValue, 4> Stores;
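// Emit one scalar store per element; the pointer and pointer info are offset
// by i * ScalarSize bytes and the alignment is reduced accordingly.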
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Offset = i * ScalarSize;
SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
Store->getPointerInfo().getWithOffset(Offset),
MinAlign(Alignment, Offset),
Store->getMemOperand()->getFlags());
Stores.push_back(Ch);
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
SDLoc dl(St);
SDValue StoredVal = St->getValue();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
if (StoredVal.getValueType().isVector() &&
StoredVal.getValueType().getVectorElementType() == MVT::i1) {
assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
"Unexpected VT");
assert(!St->isTruncatingStore() && "Expected non-truncating store");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getUNDEF(MVT::v16i1), StoredVal,
DAG.getIntPtrConstant(0, dl));
StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
if (St->isTruncatingStore())
return SDValue();
// If this is a 256-bit store of concatenated ops, we are better off splitting
// that store into two 128-bit stores. This avoids spurious use of 256-bit ops
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT.is256BitVector()) {
SmallVector<SDValue, 4> CatOps;
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
return SDValue();
}
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
TargetLowering::TypeWidenVector)
return SDValue();
MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
StoreVT.getVectorNumElements() * 2);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
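// The 64-bit payload now sits in the low half of a 128-bit vector; with only
// SSE1 (no SSE2), the X86ISD::VEXTRACT_STORE node below writes those 64 bits
// directly to memory.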
if (Subtarget.hasSSE2()) {
// Widen the vector, cast to a v2x64 type, extract the single 64-bit element
// and store it.
MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
MVT CastVT = MVT::getVectorVT(StVT, 2);
StoredVal = DAG.getBitcast(CastVT, StoredVal);
StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
St->getMemOperand());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector loads.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
if (RegVT.getVectorElementType() == MVT::i1) {
assert(EVT(RegVT) == MemVT && "Expected non-extending load");
assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
// Replace chain users with the new chain.
assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
DAG.getBitcast(MVT::v16i1, Val),
DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
}
// Nothing useful we can do without SSE2 shuffles.
assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
ISD::LoadExtType Ext = Ld->getExtensionType();
assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
&& "Only anyext and sext are currently implemented.");
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
unsigned NumElems = RegVT.getVectorNumElements();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
// The only way in which we have a legal 256-bit vector result but not the
// integer 256-bit operations needed to directly lower a sextload is if we
// have AVX1 but not AVX2. In that case, we can always emit a sextload to
// a 128-bit vector and a normal sign_extend to 256-bits that should get
// correctly legalized. We do this late to allow the canonical form of
// sextload to persist throughout the rest of the DAG combiner -- it wants
// to fold together any extensions it can, and so will fuse a sign_extend
// of an sextload into a sextload targeting a wider value.
SDValue Load;
if (MemSz == 128) {
// Just switch this to a normal load.
assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
"it must be a legal 128-bit vector "
"type!");
Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
} else {
assert(MemSz < 128 &&
"Can't extend a type wider than 128 bits to a 256 bit vector!");
// Do an sext load to a 128-bit vector type. We want to use the same
// number of elements, but elements half as wide. This will end up being
// recursively lowered by this routine, but will succeed as we definitely
// have all the necessary features if we're using AVX1.
EVT HalfEltVT =
EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
Load =
DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
}
// Replace chain users with the new chain.
assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
// Finally, do a normal sign-extend to the desired register.
SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT);
return DAG.getMergeValues({SExt, Load.getValue(1)}, dl);
}
// All sizes must be a power of two.
assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
"Non-power-of-two elements are not custom lowered!");
// Attempt to load the original value using scalar loads.
// Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
}
}
// On 32-bit systems i64 is not a legal type, so try bitcasting to f64 instead.
if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
(64 <= MemSz))
SclrLoadTy = MVT::f64;
// Calculate the number of scalar loads that we need to perform
// in order to load our vector from memory.
unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
"Can only lower sext loads with a single scalar load!");
unsigned loadRegSize = RegSz;
if (Ext == ISD::SEXTLOAD && RegSz >= 256)
loadRegSize = 128;
// If we don't have BWI we won't be able to create the shuffle needed for
// v8i8->v8i64.
if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
MemVT == MVT::v8i8)
loadRegSize = 128;
// Represent our vector as a sequence of elements which are the
// largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(
*DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
// Represent the data using the same element type that is stored in
// memory. In practice, we "widen" MemVT.
EVT WideVecVT =
EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
loadRegSize / MemVT.getScalarSizeInBits());
assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
"Invalid vector type");
// We can't shuffle using an illegal type.
assert(TLI.isTypeLegal(WideVecVT) &&
"We only lower types that form legal widened vector types");
SmallVector<SDValue, 8> Chains;
SDValue Ptr = Ld->getBasePtr();
unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8;
SDValue Increment = DAG.getConstant(OffsetInc, dl,
TLI.getPointerTy(DAG.getDataLayout()));
SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
unsigned Offset = 0;
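// Load the value in SclrLoadTy-sized chunks, inserting each chunk into Res
// and advancing both the pointer and the pointer-info offset per iteration.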
for (unsigned i = 0; i < NumLoads; ++i) {
unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset);
// Perform a single load.
SDValue ScalarLoad =
DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
Ld->getPointerInfo().getWithOffset(Offset),
NewAlign, Ld->getMemOperand()->getFlags());
Chains.push_back(ScalarLoad.getValue(1));
// Create the first element type using SCALAR_TO_VECTOR in order to avoid
// another round of DAGCombining.
if (i == 0)
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
else
Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
ScalarLoad, DAG.getIntPtrConstant(i, dl));
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
Offset += OffsetInc;
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
unsigned SizeRatio = RegSz / MemSz;
if (Ext == ISD::SEXTLOAD) {
SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
MemVT == MVT::v8i8) {
SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
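// SizeRatio is how many times wider a register element is than its in-memory
// element; loaded element i goes into lane i * SizeRatio of the widened
// vector.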
// Redistribute the loaded elements into the different locations.
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i * SizeRatio] = i;
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
DAG.getUNDEF(WideVecVT), ShuffleVec);
// Bitcast to the requested type.
Shuff = DAG.getBitcast(RegVT, Shuff);
return DAG.getMergeValues({Shuff, TF}, dl);
}
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
/// each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Opc = Op.getOpcode();
if (Opc != ISD::OR && Opc != ISD::AND)
return false;
return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
Op.getOperand(0).hasOneUse() &&
Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
Op.getOperand(1).hasOneUse());
}
/// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
/// SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
if (Op.getOpcode() != ISD::XOR)
return false;
if (isOneConstant(Op.getOperand(1)))
return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
Op.getOperand(0).hasOneUse();
return false;
}
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
bool addTest = true;
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
SDValue CC;
bool Inverted = false;
if (Cond.getOpcode() == ISD::SETCC) {
// Check for setcc([su]{add,sub,mul}o == 0).
if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
Cond.getOperand(0).getResNo() == 1 &&
(Cond.getOperand(0).getOpcode() == ISD::SADDO ||
Cond.getOperand(0).getOpcode() == ISD::UADDO ||
Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
Cond.getOperand(0).getOpcode() == ISD::USUBO ||
Cond.getOperand(0).getOpcode() == ISD::SMULO ||
Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
Inverted = true;
Cond = Cond.getOperand(0);
} else {
if (SDValue NewCond = LowerSETCC(Cond, DAG))
Cond = NewCond;
}
}
#if 0
// FIXME: LowerXALUO doesn't handle these!!
else if (Cond.getOpcode() == X86ISD::ADD ||
Cond.getOpcode() == X86ISD::SUB ||
Cond.getOpcode() == X86ISD::SMUL ||
Cond.getOpcode() == X86ISD::UMUL)
Cond = LowerXALUO(Cond, DAG);
#endif
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
// FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
Cond = Cmp;
addTest = false;
} else {
switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
default: break;
case X86::COND_O:
case X86::COND_B:
// These can only come from an arithmetic instruction with overflow,
// e.g. SADDO, UADDO.
Cond = Cond.getOperand(1);
addTest = false;
break;
}
}
}
CondOpcode = Cond.getOpcode();
if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
SDValue Value;
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
if (Inverted)
X86Cond = X86::GetOppositeBranchCondition(X86Cond);
CC = DAG.getConstant(X86Cond, dl, MVT::i8);
addTest = false;
} else {
unsigned CondOpc;
if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
SDValue Cmp = Cond.getOperand(0).getOperand(1);
if (CondOpc == ISD::OR) {
// Also, recognize the pattern generated by an FCMP_UNE. We can emit
// two branches instead of an explicit OR instruction with a
// separate test.
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp)) {
CC = Cond.getOperand(0).getOperand(0);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = Cond.getOperand(1).getOperand(0);
Cond = Cmp;
addTest = false;
}
} else { // ISD::AND
// Also, recognize the pattern generated by an FCMP_OEQ. We can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp) &&
Op.getNode()->hasOneUse()) {
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
}
} else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
// Recognize (xorb (setcc), 1) patterns. The xor inverts the condition. It
// should be transformed during DAG combining except when the condition
// is set by an arithmetic-with-overflow node.
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
} else if (Cond.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Op.getNode()->hasOneUse()) {
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
} else if (Cond.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit OR instruction with a
// separate test.
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
if (addTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
SDValue BTCC;
if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
CC = BTCC;
Cond = BT;
addTest = false;
}
}
}
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
CC = DAG.getConstant(X86Cond, dl, MVT::i8);
Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
X86Cond, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cond);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack || EmitStackProbe;
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (!Lower) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
Result = DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
// The 64-bit implementation of segmented stacks needs to clobber both r10 and
// r11. This makes it impossible to use it along with nested parameters.
const Function &F = MF.getFunction();
for (const auto &A : F.args()) {
if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
}
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
if (Align) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
Result = SP;
}
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
SDValue Ops[2] = {Result, Chain};
return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
if (!Subtarget.is64Bit() ||
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
// __va_list_tag:
// gp_offset (0 - 6 * 8)
// fp_offset (48 - 48 + 8 * 16)
// overflow_arg_area (points to parameters passed in memory).
// reg_save_area
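// Field byte offsets: gp_offset at 0, fp_offset at 4, overflow_arg_area at 8,
// reg_save_area at 16 (12 when pointers are 32 bits wide).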
SmallVector<SDValue, 8> MemOps;
SDValue FIN = Op.getOperand(1);
// Store gp_offset
SDValue Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV));
MemOps.push_back(Store);
// Store fp_offset
FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV, 4));
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store =
DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
MemOps.push_back(Store);
// Store ptr to reg_save_area.
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(
Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
SDLoc dl(Op);
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
if (ArgVT == MVT::f80) {
llvm_unreachable("va_arg for f80 not yet implemented");
} else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
} else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
} else {
llvm_unreachable("Unhandled argument type in LowerVAARG");
}
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget.hasSSE1());
}
// Insert VAARG_64 node into the DAG
// VAARG_64 returns two values: Variable Argument Address, Chain
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(
X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
/*Align=*/0,
MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
// where a va_list is still an i8*.
assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
if (Subtarget.isCallingConvWin64(
DAG.getMachineFunction().getFunction().getCallingConv()))
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue DstPtr = Op.getOperand(1);
SDValue SrcPtr = Op.getOperand(2);
const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
switch (Opc) {
case ISD::SHL:
case X86ISD::VSHL:
case X86ISD::VSHLI:
return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
case ISD::SRL:
case X86ISD::VSRL:
case X86ISD::VSRLI:
return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
case ISD::SRA:
case X86ISD::VSRA:
case X86ISD::VSRAI:
return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
}
llvm_unreachable("Unknown target vector shift node");
}
/// Handle vector element shifts where the shift amount is a constant.
/// Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
// Bitcast the source vector to the output type; this is mainly necessary for
// vXi8/vXi64 shifts.
if (VT != SrcOp.getSimpleValueType())
SrcOp = DAG.getBitcast(VT, SrcOp);
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
// Check for ShiftAmt >= element width
if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
ShiftAmt = ElementType.getSizeInBits() - 1;
else
return DAG.getConstant(0, dl, VT);
}
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
// vector of Constants or UNDEFs.
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
switch (Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRLI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRAI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
}
break;
}
return DAG.getBuildVector(VT, dl, Elts);
}
return DAG.getNode(Opc, dl, VT, SrcOp,
DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
// Catch shift-by-constant.
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version.
Opc = getTargetVShiftUniformOpcode(Opc, true);
// Need to build a vector containing shift amount.
// SSE/AVX packed shifts only use the lower 64-bit of the shift count.
// +====================+============+=======================================+
// | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
// +====================+============+=======================================+
// | i64 | Yes, No | Use ShAmt as lowest elt |
// | i32 | Yes | zero-extend in-reg |
// | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
// | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
// | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
// +====================+============+=======================================+
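// For example, with only SSE2 and a plain i32 shift amount, the last row
// applies: build a v4i32 {ShAmt, 0, undef, undef} so the low 64 bits of the
// count vector hold the zero-extended amount.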
if (SVT == MVT::i64)
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
(ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
ShAmt = ShAmt.getOperand(0);
MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
if (Subtarget.hasSSE41())
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
else {
SDValue ByteShift = DAG.getConstant(
(128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
}
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
} else {
SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
DAG.getUNDEF(SVT)};
ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
}
// The return type has to be a 128-bit type with the same element
// type as the input type.
MVT EltVT = VT.getVectorElementType();
MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
ShAmt = DAG.getBitcast(ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
/// Return \p Mask with the necessary casting or extending
/// to \p MaskVT when lowering masking intrinsics.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
if (isAllOnesConstant(Mask))
return DAG.getConstant(1, dl, MaskVT);
if (X86::isZeroNode(Mask))
return DAG.getConstant(0, dl, MaskVT);
assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
// In 32-bit mode a bitcast of i64 is illegal, so split the mask into two
// i32 halves.
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(0, dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(1, dl, MVT::i32));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
} else {
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
// When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getBitcast(BitcastVT, Mask),
DAG.getIntPtrConstant(0, dl));
}
}
/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
unsigned OpcodeSelect = ISD::VSELECT;
SDLoc dl(Op);
if (isAllOnesConstant(Mask))
return Op;
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
/// Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
if (MaskConst->getZExtValue() & 0x1)
return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
DAG.getBitcast(MVT::v8i1, Mask),
DAG.getIntPtrConstant(0, dl));
if (Op.getOpcode() == X86ISD::FSETCCM ||
Op.getOpcode() == X86ISD::FSETCCM_SAE ||
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
if (!Fn->hasPersonalityFn())
report_fatal_error(
"querying registration node size for function without personality");
// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
// WinEHStatePass for the full struct definition.
switch (classifyEHPersonality(Fn->getPersonalityFn())) {
case EHPersonality::MSVC_X86SEH: return 24;
case EHPersonality::MSVC_CXX: return 16;
default: break;
}
report_fatal_error(
"can only recover FP for 32-bit MSVC EH personality functions");
}
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
/// RegNodeBase = EntryEBP - RegNodeSize
/// ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
SDValue EntryEBP) {
MachineFunction &MF = DAG.getMachineFunction();
SDLoc dl;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// It's possible that the parent function no longer has a personality function
// if the exceptional code was optimized away, in which case we just return
// the incoming EBP.
if (!Fn->hasPersonalityFn())
return EntryEBP;
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
// Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
// prologue to RBP in the parent function.
const X86Subtarget &Subtarget =
static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.is64Bit())
return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// RegNodeBase = EntryEBP - RegNodeSize
// ParentFP = RegNodeBase - ParentFrameOffset
SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
DAG.getConstant(RegNodeSize, dl, PtrVT));
return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
return false;
};
auto isRoundModeSAE = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC;
return false;
};
auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
RC = C->getZExtValue();
if (RC & X86::STATIC_ROUNDING::NO_EXC) {
// Clear the NO_EXC bit and check remaining bits.
RC ^= X86::STATIC_ROUNDING::NO_EXC;
return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
RC == X86::STATIC_ROUNDING::TO_POS_INF ||
RC == X86::STATIC_ROUNDING::TO_ZERO;
}
}
return false;
};
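// The lambdas above classify the rounding-mode operand: current direction,
// suppress-all-exceptions only, or SAE combined with an explicit rounding
// control (returned in RC).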
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(2);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1),
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
}
case INTR_TYPE_1OP_SAE: {
SDValue Sae = Op.getOperand(2);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
}
case INTR_TYPE_2OP: {
SDValue Src2 = Op.getOperand(2);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(3);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1), Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Src2);
}
case INTR_TYPE_2OP_SAE: {
SDValue Sae = Op.getOperand(3);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
case INTR_TYPE_3OP:
case INTR_TYPE_3OP_IMM8: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
if (IntrData->Type == INTR_TYPE_3OP_IMM8)
Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src1, Src2, Src3,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Src1, Src2, Src3);
}
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
// - RC Opcode is specified and
// - RC is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getVectorMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, PassThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK_SAE: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Rnd = Op.getOperand(4);
unsigned Opc;
if (isRoundModeCurDirection(Rnd))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Rnd))
Opc = IntrData->Opc1;
else
return SDValue();
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
// There are 2 kinds of intrinsics in this group:
// (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
// (2) With rounding mode and sae - 7 operands.
bool HasRounding = IntrWithRoundingModeOpcode != 0;
if (Op.getNumOperands() == (5U + HasRounding)) {
if (HasRounding) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getScalarMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, passThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Src2),
Mask, passThru, Subtarget, DAG);
}
assert(Op.getNumOperands() == (6U + HasRounding) &&
"Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (HasRounding) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrWithRoundingModeOpcode;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RND: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Rnd = Op.getOperand(5);
SDValue NewOp;
unsigned RC = 0;
if (isRoundModeCurDirection(Rnd))
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
else if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else
return SDValue();
return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Sae = Op.getOperand(5);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue NewOp;
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
if (!NewOp)
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Sae = Op.getOperand(6);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case BLENDV: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
Src3 = DAG.getBitcast(MaskVT, Src3);
// Reverse the operands to match VSELECT order.
return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
}
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
// Swap Src1 and Src2 in the node creation
return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
}
case IFMA_OP:
// NOTE: We need to swizzle the operands to pass the multiply operands
// first.
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
FPclassMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
}
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(4);
if (isRoundModeSAE(Sae))
return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC, Sae);
if (!isRoundModeCurDirection(Sae))
return SDValue();
}
// Default rounding mode.
return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC);
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
SDValue Mask = Op.getOperand(4);
SDValue Cmp;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
// Default rounding mode.
if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
CmpMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
break;
}
case ISD::SETNE: { // (ZF = 1 or PF = 1)
SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
break;
}
case ISD::SETGE: // CF = 0
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
break;
default:
llvm_unreachable("Unexpected illegal condition!");
}
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8));
else if (isRoundModeSAE(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
else
return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getConstant(0, dl, MVT::v16i1),
FCmp, DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
DAG.getBitcast(MVT::i16, Ins));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), Subtarget,
DAG);
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
return Op.getOperand(1);
// Avoid false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, VT);
return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
Mask);
}
case FIXUPIMM:
case FIXUPIMM_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Passthru = (IntrData->Type == FIXUPIMM)
? Src1
: getZeroVector(VT, Subtarget, DAG, dl);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
}
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
Op.getOperand(2),
DAG.getConstant(0xf, dl, MVT::i32));
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), RoundingMode);
}
case ROUNDS: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
Op.getOperand(3),
DAG.getConstant(0xf, dl, MVT::i32));
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
// ADC/ADCX/SBB
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
SDValue Res;
// If the carry in is zero, then we should just use ADD/SUB instead of
// ADC/SBB.
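// (Otherwise, regenerate CF from the i8 carry-in: adding 0xff to a nonzero
// carry-in wraps and sets CF, while adding it to zero does not, so the
// ADC/SBB below sees the intended incoming carry.)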
if (isNullConstant(Op.getOperand(1))) {
Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
Op.getOperand(3));
} else {
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
DAG.getConstant(-1, dl, MVT::i8));
Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
Op.getOperand(3), GenCF.getValue(1));
}
SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
SDValue Results[] = { SetCC, Res };
return DAG.getMergeValues(Results, dl);
}
case CVTPD2PS_MASK:
case CVTPD2DQ_MASK:
case CVTQQ2PS_MASK:
case TRUNCATE_TO_REG: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
Mask);
}
case CVTPS2PH_MASK: {
SDValue Src = Op.getOperand(1);
SDValue Rnd = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
PassThru, Mask);
}
case CVTNEPS2BF16_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (ISD::isBuildVectorAllOnes(Mask.getNode()))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
// Break false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
Mask);
}
default:
break;
}
}
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
// ptest and testp intrinsics. The intrinsics these come from are designed to
// return an integer value, not just an instruction, so lower it to the ptest
// or testp pattern and a setcc for the result.
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestz_256:
case Intrinsic::x86_avx_ptestc_256:
case Intrinsic::x86_avx_ptestnzc_256:
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
unsigned TestOpc = X86ISD::PTEST;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
// CF = 1
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_avx_ptestz_256:
// ZF = 1
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestc_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_avx_ptestc_256:
// CF = 1
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestnzc_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestnzc_256:
// ZF and CF = 0
X86CC = X86::COND_A;
break;
}
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
case Intrinsic::x86_sse42_pcmpestric128:
case Intrinsic::x86_sse42_pcmpistrio128:
case Intrinsic::x86_sse42_pcmpestrio128:
case Intrinsic::x86_sse42_pcmpistris128:
case Intrinsic::x86_sse42_pcmpestris128:
case Intrinsic::x86_sse42_pcmpistriz128:
case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistri128:
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::x86_sse42_pcmpistrm128:
case Intrinsic::x86_sse42_pcmpestrm128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
auto &Context = MF.getMMI().getContext();
MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
Twine(MF.getFunctionNumber()));
return DAG.getNode(getGlobalWrapperKind(), dl, VT,
DAG.getMCSymbol(S, PtrVT));
}
case Intrinsic::x86_seh_lsda: {
// Compute the symbol for the LSDA. We know it'll get emitted later.
MachineFunction &MF = DAG.getMachineFunction();
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
SDValue Result = DAG.getMCSymbol(LSDASym, VT);
return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
}
case Intrinsic::eh_recoverfp: {
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
report_fatal_error(
"llvm.eh.recoverfp must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
}
case Intrinsic::localaddress: {
// Returns one of the stack, base, or frame pointer registers, depending on
// which is used to reference local variables.
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
else { // Handles the SP or FP case.
bool CantUseFP = RegInfo->needsStackRealignment(MF);
if (CantUseFP)
Reg = RegInfo->getPtrSizedStackRegister(MF);
else
Reg = RegInfo->getPtrSizedFrameRegister(MF);
}
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
case Intrinsic::x86_avx512_vp2intersect_d_512:
case Intrinsic::x86_avx512_vp2intersect_d_256:
case Intrinsic::x86_avx512_vp2intersect_d_128: {
MVT MaskVT = Op.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
SDLoc DL(Op);
SDValue Operation =
DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
Op->getOperand(1), Op->getOperand(2));
SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
MaskVT, Operation);
SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
MaskVT, Operation);
return DAG.getMergeValues({Result0, Result1}, DL);
}
}
}
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
VT.getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
// We support two versions of the gather intrinsics. One with scalar mask and
// one with vXi1 mask. Convert scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
// We support two versions of the scatter intrinsics. One with scalar mask and
// one with vXi1 mask. Convert scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return Res.getValue(1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Mask, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
/// Handles the lowering of builtin intrinsics with chain that return their
/// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
/// TargetOpcode.
/// Returns a Glue value which can be used to add extra copy-from-reg if the
/// expanded intrinsic implicitly defines extra registers (i.e. not just
/// EDX:EAX).
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
unsigned TargetOpcode,
unsigned SrcReg,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
SDValue Chain = N->getOperand(0);
SDValue Glue;
if (SrcReg) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
Glue = Chain.getValue(1);
}
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue N1Ops[] = {Chain, Glue};
SDNode *N1 = DAG.getMachineNode(
TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
Chain = SDValue(N1, 0);
// The expanded instruction returns its 64-bit result in registers EDX:EAX.
SDValue LO, HI;
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
Chain = HI.getValue(1);
Glue = HI.getValue(2);
if (Subtarget.is64Bit()) {
// Merge the two 32-bit values into a 64-bit one.
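// (In 64-bit mode the instructions expanded here, e.g. RDTSC, RDPMC and
// XGETBV, clear the upper halves of RAX and RDX, so OR-ing HI << 32 with LO
// reassembles the full 64-bit result.)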
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return Glue;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
SDValue Ops[] = { LO, HI };
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
return Glue;
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
/* NoRegister */0, Subtarget,
Results);
if (Opcode != X86::RDTSCP)
return;
SDValue Chain = Results[1];
// Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
// the ECX register. Add 'ecx' explicitly to the chain.
SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
Results[1] = ecx;
Results.push_back(ecx.getValue(1));
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 3> Results;
SDLoc DL(Op);
getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, DL);
}
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue RegNode = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EH registrations only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue EHGuard = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EHGuard only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
EHInfo->EHGuardFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
return SignedSat ?
DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Chain, Val, Ptr, Mask };
return SignedSat ?
DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
case llvm::Intrinsic::x86_rdpkru: {
SDLoc dl(Op);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
// Create a RDPKRU node and pass 0 to the ECX parameter.
return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::x86_wrpkru: {
SDLoc dl(Op);
// Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
// to the EDX and ECX parameters.
return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
Op.getOperand(0), Op.getOperand(2),
DAG.getConstant(0, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
// during FinalizeISel in EmitInstrWithCustomInserter.
return SDValue();
}
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
case Intrinsic::x86_umwait:
case Intrinsic::x86_tpause: {
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");
case Intrinsic::x86_umwait:
Opcode = X86ISD::UMWAIT;
break;
case Intrinsic::x86_tpause:
Opcode = X86ISD::TPAUSE;
break;
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
Opcode = X86ISD::LWPINS;
break;
}
SDValue Operation =
DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
case Intrinsic::x86_enqcmd:
case Intrinsic::x86_enqcmds: {
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic!");
case Intrinsic::x86_enqcmd:
Opcode = X86ISD::ENQCMD;
break;
case Intrinsic::x86_enqcmds:
Opcode = X86ISD::ENQCMDS;
break;
}
SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
Op.getOperand(3));
SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
}
return SDValue();
}
SDLoc dl(Op);
switch(IntrData->Type) {
default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, cast to i32.
SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
DAG.getConstant(1, dl, Op->getValueType(1)),
DAG.getConstant(X86::COND_B, dl, MVT::i8),
SDValue(Result.getNode(), 1) };
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
case GATHER_AVX2: {
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case GATHER: {
//gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
//scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
assert((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
SmallVector<SDValue, 2> Results;
getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
case RDPMC:
// GetExtended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
// RDPMC uses ECX to select the index of the performance counter to read.
// XGETBV uses ECX to select the index of the XCR register to return.
// The result is stored into registers EDX:EAX.
expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
Ret, SDValue(InTrans.getNode(), 1));
}
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
SDValue Mask = Op.getOperand(4);
SDValue DataToTruncate = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
EVT MemVT = MemIntr->getMemoryVT();
uint16_t TruncationOp = IntrData->Opc0;
switch (TruncationOp) {
case X86ISD::VTRUNC: {
if (isAllOnesConstant(Mask)) // return just a truncate store
return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
MemIntr->getMemOperand(), true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
if (isAllOnesConstant(Mask))
return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand(), DAG);
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
VMask, MemVT, MemIntr->getMemOperand(), DAG);
}
default:
llvm_unreachable("Unsupported truncstore intrinsic");
}
}
}
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
MachinePointerInfo());
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
MachinePointerInfo());
}
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
return getReturnAddressFrameIndex(DAG);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
EVT VT = Op.getValueType();
MFI.setFrameAddressIsTaken(true);
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
// Depth > 0 makes no sense on targets which use Windows unwind codes. It
// is not possible to crawl up the stack without looking at the unwind codes
// simultaneously.
int FrameAddrIndex = FuncInfo->getFAIndex();
if (!FrameAddrIndex) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
FuncInfo->setFAIndex(FrameAddrIndex);
}
return DAG.getFrameIndex(FrameAddrIndex, VT);
}
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const MachineFunction &MF = DAG.getMachineFunction();
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
.Case("ebp", X86::EBP)
.Case("rbp", X86::RBP)
.Default(0);
if (Reg == X86::EBP || Reg == X86::RBP) {
if (!TFI.hasFP(MF))
report_fatal_error("register " + StringRef(RegName) +
" is allocatable: function has no frame pointer");
#ifndef NDEBUG
else {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
"Invalid Frame Register!");
}
#endif
}
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
}
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
unsigned X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
unsigned X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}
bool X86TargetLowering::needsFixedCatchObjects() const {
return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
SDLoc dl (Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
dl));
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
DAG.getRegister(StoreAddrReg, PtrVT));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
// If the subtarget is not 64-bit, we may need the global base reg
// after ISel pseudo expansion, i.e., after the CGBR pass has run.
// Therefore, ask for the GlobalBaseReg now, so that the pass
// inserts the code for us in case we need it.
// Otherwise, we will end up in a situation where we will
// reference a virtual register that is not defined!
if (!Subtarget.is64Bit()) {
const X86InstrInfo *TII = Subtarget.getInstrInfo();
(void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
}
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
Op.getOperand(0));
}
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
}
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (Subtarget.is64Bit()) {
SDValue OutChains[6];
// Large code-model.
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
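// The stores below assemble a 23-byte trampoline (the little-endian i16
// stores place the REX byte before the opcode):
//   0-1:   REX.WB + movabsq opcode targeting r11
//   2-9:   FPtr (the nested function's address)
//   10-11: REX.WB + movabsq opcode targeting r10
//   12-19: Nest (the 'nest' parameter value)
//   20-21: REX.WB + 0xFF (jmp-group opcode)
//   22:    ModRM byte encoding "jmpq *%r11"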
// Load the pointer to the nested function into R11.
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
OutChains[1] =
DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
/* Alignment = */ 2);
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(10, dl, MVT::i64));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 10));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
OutChains[3] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
/* Alignment = */ 2);
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(20, dl, MVT::i64));
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 20));
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(22, dl, MVT::i64));
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 22));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;
switch (CC) {
default:
llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::X86_StdCall: {
// Pass 'nest' parameter in ECX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
unsigned Idx = 1;
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
auto &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
}
if (InRegCount > 2) {
report_fatal_error("Nest register in use - reduce number of inreg"
" parameters!");
}
}
break;
}
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
break;
}
SDValue OutChains[4];
SDValue Addr, Disp;
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(10, dl, MVT::i32));
Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] =
DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
OutChains[1] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
/* Alignment = */ 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 5),
/* Alignment = */ 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
OutChains[3] =
DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
/* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
/*
The rounding mode is in bits 11:10 of the x87 FP control word (FPCW), and has the following
settings:
00 Round to nearest
01 Round to -inf
10 Round to +inf
11 Round to 0
FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined
0 Round to 0
1 Round to nearest
2 Round to +inf
3 Round to -inf
To perform the conversion, we do:
(((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
*/
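// For example, with the control word's RC field set to 01 (round toward
// -inf): bit 11 contributes 0, bit 10 contributes 2, and (0 | 2) + 1 = 3,
// i.e. FLT_ROUNDS "round to -inf". Likewise RC = 11 (round toward zero)
// gives (1 | 2) + 1 = 4, which masks to 0.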
MachineFunction &MF = DAG.getMachineFunction();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
MachineMemOperand::MOStore, 2, 2);
SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other),
Ops, MVT::i16, MMO);
// Load FP Control Word from stack slot
SDValue CWD =
DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
// Transform as necessary
SDValue CWD1 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x800, DL, MVT::i16)),
DAG.getConstant(11, DL, MVT::i8));
SDValue CWD2 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x400, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));
SDValue RetVal =
DAG.getNode(ISD::AND, DL, MVT::i16,
DAG.getNode(ISD::ADD, DL, MVT::i16,
DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
DAG.getConstant(1, DL, MVT::i16)),
DAG.getConstant(3, DL, MVT::i16));
return DAG.getNode((VT.getSizeInBits() < 16 ?
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
// Split a unary integer op into 2 half-sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumElems = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
MVT EltVT = VT.getVectorElementType();
SDValue Src = Op.getOperand(0);
assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
"Src and Op should have the same element type!");
// Extract the Lo/Hi vectors
SDLoc dl(Op);
SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}
// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return LowerVectorIntUnary(Op, DAG);
}
// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is512BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 512-bit vector integer operation");
return LowerVectorIntUnary(Op, DAG);
}
/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
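// E.g. for an i8 element x, ctlz32(zext32(x)) counts 32 - 8 = 24 extra
// leading zeros, so ctlz8(x) = ctlz32(zext32(x)) - 24; the Delta subtracted
// below is exactly 32 - EltSizeInBits.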
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
"Unsupported element type");
// Split the vector; its Lo and Hi parts will be handled in the next iteration.
if (NumElems > 16 ||
(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
return LowerVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation");
// Use native supported vector instruction vplzcntd.
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
int NumElts = VT.getVectorNumElements();
int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
// Per-nibble leading zero PSHUFB lookup table.
const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
/* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
/* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumBytes; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to a byte vector, then split those bytes
// into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
// If the hi input nibble is zero then we add both results together, otherwise
// we just take the hi result (by masking the lo result to zero before the
// add).
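// E.g. byte 0x1A: the hi nibble is 0x1, so the LUT yields 3 and the lo
// result is masked away, giving ctlz8(0x1A) == 3. For byte 0x0A the hi
// nibble is zero, so we add 4 (LUT for 0) and 0 (LUT for 0xA) to get 4.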
SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
SDValue Zero = DAG.getConstant(0, DL, CurrVT);
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
SDValue Lo = Op0;
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
SDValue HiZ;
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
}
Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
// Merge the result back from vXi8 to VT, working on the lo/hi halves
// of the current vector width in the same way we did for the nibbles.
// If the upper half of the input element is zero then add the halves'
// leading zero counts together, otherwise just use the upper half's.
// Double the width of the result until we are at the target width.
while (CurrVT != VT) {
int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
int CurrNumElts = CurrVT.getVectorNumElements();
MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
// Check if the upper half of the input element is zero.
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
}
HiZ = DAG.getBitcast(NextVT, HiZ);
// Move the upper/lower halves to the lower bits as we'll be extending to
// NextVT. Keep the lower half's count only where the upper half is zero
// (HiZ), mask it to zero otherwise, and add the two counts together.
SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
CurrVT = NextVT;
}
return Res;
}
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasCDI() &&
// vXi8 vectors need to be promoted to 512-bits for vXi32.
(Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return Lower512IntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
if (VT.isVector())
return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is no i8 bsr.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
if (Opc == ISD::CTLZ) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
}
// Finally xor with NumBits-1.
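// For a power-of-two bit width, NumBits - 1 - BSR(x) == BSR(x) ^ (NumBits-1);
// e.g. for i32 x = 0x00010000, BSR gives 16 and 16 ^ 31 == 15 == ctlz(x).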
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
return Op;
}
static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumBits = VT.getScalarSizeInBits();
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
// Issue a bsf (scan bits forward) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
DAG.getConstant(NumBits, dl, VT),
DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is512BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
if (VT == MVT::i16 || VT == MVT::i32)
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return split256IntArith(Op, DAG);
}
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
if (VT.getScalarType() == MVT::i1) {
SDLoc dl(Op);
switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
// *addsat i1 X, Y --> X | Y
return DAG.getNode(ISD::OR, dl, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
// *subsat i1 X, Y --> X & ~Y
return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
}
}
if (VT.is128BitVector()) {
// Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), VT);
SDLoc DL(Op);
if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
// uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
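// E.g. for i8 lanes 200 + 100: the add wraps to 44, 200 >u 44 holds, so the
// result saturates to 255.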
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
}
if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
// usubsat X, Y --> (X >u Y) ? X - Y : 0
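// E.g. for i8 lanes 44 - 200: 44 >u 200 is false, so the result clamps to 0.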
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
}
// Use default expansion.
return SDValue();
}
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return split256IntArith(Op, DAG);
}
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
// Since X86 does not have CMOV for 8-bit integer, we don't convert
// 8-bit integer abs to NEG and CMOV.
SDLoc DL(Op);
SDValue N0 = Op.getOperand(0);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
DAG.getConstant(0, DL, VT), N0);
SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
}
// ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
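// (BLENDV selects per lane on the sign bit of the mask operand, which here
// is X itself: negative lanes take 0-X, non-negative lanes keep X.)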
if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
SDValue Sub =
DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
}
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
assert(VT.isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntUnary(Op, DAG);
}
// Default to expand.
return SDValue();
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// For AVX1 cases, split to use legal ops (everything but v4i64).
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
return split256IntArith(Op, DAG);
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
// using the SMIN/SMAX instructions and flipping the signbit back.
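// E.g. umin(0xFFFF, 0x0001): flipping the sign bits gives 0x7FFF and 0x8001,
// smin picks 0x8001, and flipping back yields 0x0001, the unsigned minimum.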
if (VT == MVT::v8i16) {
assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
"Unexpected MIN/MAX opcode");
SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
}
// Else, expand to a compare/select.
ISD::CondCode CC;
switch (Opcode) {
case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
default: llvm_unreachable("Unknown MINMAX opcode");
}
SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
return DAG.getSelect(DL, VT, Cond, N0, N1);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return split256IntArith(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Lower v16i8/v32i8/v64i8 mul as extension to v8i16/v16i16/v32i16
// vector pairs, multiply and truncate.
if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
return DAG.getNode(
ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::MUL, dl, ExVT,
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
}
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Extract the lo/hi parts and any-extend to i16.
// We're going to mask the pmullw results down to the low byte of each
// 16-bit element, so it doesn't matter what ends up in the high byte of
// each any-extended source element.
SDValue Undef = DAG.getUNDEF(VT);
SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If the RHS is a constant, manually unpackl/unpackh.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
MVT::i16));
HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
MVT::i16));
}
}
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
}
// Multiply, mask the lower 8 bits of the lo/hi results and pack.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, A),
DAG.getBitcast(MVT::v2i64, B));
// Now multiply odd parts.
SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Aodds),
DAG.getBitcast(MVT::v2i64, Bodds));
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);
//
// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;
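// This is schoolbook multiplication on 32-bit digits:
// a * b == aLo*bLo + ((aLo*bHi + aHi*bLo) << 32) + ((aHi*bHi) << 64),
// where the final term vanishes modulo 2^64.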
KnownBits AKnown = DAG.computeKnownBits(A);
KnownBits BKnown = DAG.computeKnownBits(B);
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
SDValue Zero = DAG.getConstant(0, dl, VT);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
}
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
}
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsSigned = Op->getOpcode() == ISD::MULHS;
unsigned NumElts = VT.getVectorNumElements();
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return split256IntArith(Op, DAG);
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widened result.
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
//
// In other words, to have all the results, we need to perform two PMULxD:
// 1. one with the even values.
// 2. one with the odd values.
// To achieve #2, we need to place the odd values at an even position.
//
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
makeArrayRef(&Mask[0], NumElts));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
makeArrayRef(&Mask[0], NumElts));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
unsigned Opcode =
(IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, A),
DAG.getBitcast(MulVT, B)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Odd0),
DAG.getBitcast(MulVT, Odd1)));
// Shuffle it back into the right order.
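// e.g. for NumElts == 4 this produces the mask {1, 5, 3, 7}, which picks the
// high i32 of each 64-bit product: <hi(a*e), hi(b*f), hi(c*g), hi(d*h)>.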
SmallVector<int, 16> ShufMask(NumElts);
for (int i = 0; i != (int)NumElts; ++i)
ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
// If we have a signed multiply but no PMULDQ, fix up the result of an
// unsigned multiply.
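// This uses the identity mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0)
// - (b < 0 ? a : 0): reinterpreting a negative a as unsigned adds 2^32 * b
// to the full product (and vice versa), which only affects the high half.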
if (IsSigned && !Subtarget.hasSSE41()) {
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
}
return Res;
}
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
"Unsupported vector type");
// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
// logical shift down the upper half and pack back to i8.
// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
// and then ashr/lshr the upper bits down to the lower bits before multiply.
unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}
// For signed 512-bit vectors, split into 256-bit vectors to allow the
// sign-extension to occur.
if (VT == MVT::v64i8 && IsSigned)
return split512IntArith(Op, DAG);
// Signed AVX2 implementation - extend xmm subvectors to ymm.
if (VT == MVT::v32i8 && IsSigned) {
MVT ExVT = MVT::v16i16;
SDValue ALo = extract128BitVector(A, 0, DAG, dl);
SDValue BLo = extract128BitVector(B, 0, DAG, dl);
SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
// Bitcast back to VT and then pack all the even elements from Lo and Hi.
// Shuffle lowering should turn this into PACKUS+PERMQ
Lo = DAG.getBitcast(VT, Lo);
Hi = DAG.getBitcast(VT, Hi);
return DAG.getVectorShuffle(VT, dl, Lo, Hi,
{ 0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62});
}
// For signed v16i8 and all unsigned vXi8 we will unpack the low and high
// half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
// shift the results and pack the half lane results back together.
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
// Extract the lo parts and zero/sign extend to i16.
// Only use SSE4.1 instructions for signed v16i8 where using unpack requires
// shifts to sign extend. Using unpack for unsigned only requires an xor to
// create zeros and a copy due to tied register constraints pre-AVX. But using
// zero_extend_vector_inreg would require an additional pshufd for the high
// part.
SDValue ALo, AHi;
if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
} else if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
}
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If the RHS is a constant, manually unpackl/unpackh and extend.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
SDValue LoOp = B.getOperand(i + j);
SDValue HiOp = B.getOperand(i + j + 8);
if (IsSigned) {
LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
} else {
LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
}
LoOps.push_back(LoOp);
HiOps.push_back(HiOp);
}
}
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
} else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
}
// Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
// and pack back to vXi8.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
// Bitcast back to VT and then pack all the even elements from Lo and Hi.
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
"Unexpected return type for lowering");
RTLIB::Libcall LC;
bool isSigned;
switch (Op->getOpcode()) {
default: llvm_unreachable("Unexpected request for libcall!");
case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
}
SDLoc dl(Op);
SDValue InChain = DAG.getEntryNode();
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
EVT ArgVT = Op->getOperand(i).getValueType();
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
Entry.Node = StackPtr;
InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(InChain)
.setLibCallee(
getLibcallCallingConv(LC),
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
std::move(Args))
.setInRegister()
.setSExtResult(isSigned)
.setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
}
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (VT.getScalarSizeInBits() < 16)
return false;
if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256());
bool AShift = LShift && (Subtarget.hasAVX512() ||
(VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
return false;
// vXi16 supported only on AVX-512, BWI
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
return (Opcode == ISD::SRA) ? AShift : LShift;
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
// ashr(R, 63) === cmp_slt(R, 0)
if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
"Unsupported PCMPGT op");
return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
}
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt - 32, DAG);
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{9, 1, 11, 3, 13, 5, 15, 7});
} else {
// SRA upper i32, SRL whole i64 and select lower i32.
SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt, DAG);
SDValue Lower =
getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
Lower = DAG.getBitcast(ExVT, Lower);
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{8, 1, 10, 3, 12, 5, 14, 7});
}
return DAG.getBitcast(VT, Ex);
};
// Optimize shl/srl/sra with constant shift amount.
APInt APIntShiftAmt;
if (!isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
// If the shift amount is out of range, return undef.
if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
return DAG.getUNDEF(VT);
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Simple i8 add case
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
return DAG.getNode(ISD::ADD, dl, VT, R, R);
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
SDValue Zeros = DAG.getConstant(0, dl, VT);
if (VT.is512BitVector()) {
assert(VT == MVT::v64i8 && "Unexpected element type!");
SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
}
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the low ShiftAmt bits of each byte.
APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the high ShiftAmt bits of each byte.
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
}
if (Op.getOpcode() == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
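// e.g. for i8 with ShiftAmt == 3: Mask == 0x10, and R == 0xF0 (-16) gives
// lshr == 0x1E, xor == 0x0E, sub == 0xFE, i.e. -2 == ashr(-16, 3).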
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
llvm_unreachable("Unknown shift opcode.");
}
return SDValue();
}
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
}
// vXi8 shifts - shift as v8i16 + mask result.
if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
(VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
VT == MVT::v64i8) &&
!Subtarget.hasXOP()) {
unsigned NumElts = VT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
// Create the mask using vXi16 shifts. For shift-rights we need to move
// the upper byte down before splatting the vXi8 mask.
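// e.g. for a uniform SRL by 3: PSRLW on an all-ones vXi16 gives 0x1FFF per
// word, shifting right by another 8 leaves 0x001F, and splatting byte 0
// produces the per-byte mask 0x1F (0xFF >> 3).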
SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
BaseShAmt, Subtarget, DAG);
if (Opcode != ISD::SHL)
BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
8, DAG);
BitMask = DAG.getBitcast(VT, BitMask);
BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
SmallVector<int, 64>(NumElts, 0));
SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
DAG.getBitcast(ExtVT, R), BaseShAmt,
Subtarget, DAG);
Res = DAG.getBitcast(VT, Res);
Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
if (Opcode == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
// SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
BaseShAmt, Subtarget, DAG);
SignMask = DAG.getBitcast(VT, SignMask);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
}
return Res;
}
}
}
// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
std::vector<SDValue> Vals(Ratio);
for (unsigned i = 0; i != Ratio; ++i)
Vals[i] = Amt.getOperand(i);
for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
for (unsigned j = 0; j != Ratio; ++j)
if (Vals[j] != Amt.getOperand(i + j))
return SDValue();
}
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
}
return SDValue();
}
// Convert a shift/rotate left amount to a multiplication scale factor.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Amt.getSimpleValueType();
if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
(!Subtarget.hasAVX512() && VT == MVT::v16i8)))
return SDValue();
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
SmallVector<SDValue, 8> Elts;
MVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();
APInt One(SVTBits, 1);
unsigned NumElems = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = Amt->getOperand(i);
if (Op->isUndef()) {
Elts.push_back(Op);
continue;
}
ConstantSDNode *ND = cast<ConstantSDNode>(Op);
APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
uint64_t ShAmt = C.getZExtValue();
if (ShAmt >= SVTBits) {
Elts.push_back(DAG.getUNDEF(SVT));
continue;
}
Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
}
return DAG.getBuildVector(VT, dl, Elts);
}
// If the target doesn't support variable shifts, use either FP conversion
// or integer multiplication to avoid shifting each element individually.
if (VT == MVT::v4i32) {
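// Build a float whose exponent field is 127 + Amt: shifting Amt into the
// exponent and adding 1.0f (0x3f800000) gives 2.0f^Amt, and FP_TO_SINT turns
// that back into the 1 << Amt scale factor.
// e.g. Amt == 5: (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f -> 32.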
Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
DAG.getConstant(0x3f800000U, dl, VT));
Amt = DAG.getBitcast(MVT::v4f32, Amt);
return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
}
// AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
DAG.getBitcast(VT, Hi),
{0, 2, 4, 6, 8, 10, 12, 14});
}
return SDValue();
}
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
unsigned Opc = Op.getOpcode();
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
if (SupportedVectorVarShift(VT, Subtarget, Opc))
return Op;
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Opc == ISD::SRL || Opc == ISD::SRA) {
SDValue Zero = DAG.getConstant(0, dl, VT);
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
}
if (Opc == ISD::SHL || Opc == ISD::SRL)
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
if (Opc == ISD::SRA)
return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
}
// v2i64 vector logical shifts can efficiently avoid scalarization - do the
// shifts per-lane and then shuffle the partial results back together.
if (VT == MVT::v2i64 && Opc != ISD::SRA) {
// Splat the shift amounts so the scalar shifts above will catch it.
SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
}
// i64 vector arithmetic shift can be emulated with the transform:
// M = lshr(SIGN_MASK, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
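// lshr leaves the sign bit at position 63 - Amt; the xor/sub pair then
// sign-extends it, turning the top bits into ones for negative values and
// leaving non-negative values unchanged.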
if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Opc == ISD::SRA) {
SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);
R = DAG.getNode(ISD::SUB, dl, VT, R, M);
return R;
}
// If possible, lower this shift as a sequence of two shifts by
// constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//
// Could be rewritten as:
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
//
// The advantage is that the two shifts from the example would be
// lowered as X86ISD::VSRLI nodes in parallel before blending.
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue Amt1, Amt2;
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i != NumElts; ++i) {
SDValue A = Amt->getOperand(i);
if (A.isUndef()) {
ShuffleMask.push_back(SM_SentinelUndef);
continue;
}
if (!Amt1 || Amt1 == A) {
ShuffleMask.push_back(i);
Amt1 = A;
continue;
}
if (!Amt2 || Amt2 == A) {
ShuffleMask.push_back(i + NumElts);
Amt2 = A;
continue;
}
break;
}
// Only perform this blend if we can perform it without loading a mask.
if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
(VT != MVT::v16i16 ||
is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
(VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
canWidenShuffleElements(ShuffleMask))) {
auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
Cst2->getAPIntValue().ult(EltSizeInBits)) {
SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst1->getZExtValue(), DAG);
SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst2->getZExtValue(), DAG);
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}
}
}
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
if (Opc == ISD::SHL)
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
// Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
// can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
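// i.e. srl(R, Amt) == mulhu(R, 1 << (EltBits - Amt)) for Amt in [1, EltBits-1];
// the Amt == 0 lanes are handled separately via the ZAmt select below.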
if (Opc == ISD::SRL && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
return DAG.getSelect(dl, VT, ZAmt, R, Res);
}
}
// Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
// can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
// TODO: Special case handling for shift by 0/1, really we can afford either
// of these cases in pre-SSE41/XOP/AVX512 but not both.
if (Opc == ISD::SRA && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
!Subtarget.hasAVX512()) ||
DAG.isKnownNeverZero(Amt))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Amt0 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
SDValue Amt1 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
SDValue Sra1 =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
Res = DAG.getSelect(dl, VT, Amt0, R, Res);
return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
}
}
// v4i32 Non Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64
// and shift using the SSE2 variable shifts.
// The separate results can then be blended together.
if (VT == MVT::v4i32) {
SDValue Amt0, Amt1, Amt2, Amt3;
if (ConstantAmt) {
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
} else {
// The SSE2 shifts use the lower i64 as the same shift amount for
// all lanes and the upper i64 is ignored. On AVX we're better off
// just zero-extending, but for SSE just duplicating the top 16-bits is
// cheaper and has the same effect for out of range values.
if (Subtarget.hasAVX()) {
SDValue Z = DAG.getConstant(0, dl, VT);
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
} else {
SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{4, 5, 6, 7, -1, -1, -1, -1});
Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{2, 3, 3, 3, -1, -1, -1, -1});
Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{2, 3, 3, 3, -1, -1, -1, -1});
}
}
unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
// Merge the shifted lane results optimally with/without PBLENDW.
// TODO - ideally shuffle combining would handle this.
if (Subtarget.hasSSE41()) {
SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
}
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
// NOTE: We honor preferred vector width before promoting to 512-bits.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
"Unexpected vector type");
MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
R = DAG.getNode(ExtOpc, dl, ExtVT, R);
Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Opc, dl, ExtVT, R, Amt));
}
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
(VT == MVT::v16i8 || VT == MVT::v64i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8);
// Extend constant shift amount to vXi16 (it doesn't matter if the type
// isn't legal).
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
"Constant build vector expected");
if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
: DAG.getZExtOrTrunc(R, dl, ExVT);
R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
return DAG.getZExtOrTrunc(R, dl, VT);
}
SmallVector<SDValue, 16> LoAmt, HiAmt;
for (int i = 0; i != NumElts; i += 16) {
for (int j = 0; j != 8; ++j) {
LoAmt.push_back(Amt.getOperand(i + j));
HiAmt.push_back(Amt.getOperand(i + j + 8));
}
}
MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (VT.is512BitVector()) {
// On AVX512BW targets we make use of the fact that VSELECT lowers
// to a masked blend which selects bytes based just on the sign bit
// extracted to a mask.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, dl, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
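// After the << 5, bit 2 of each 3-bit amount sits in its byte's sign bit;
// each SignBitSelect below conditionally applies a shift by 4, then 2, then 1,
// with 'a += a' moving the next lower amount bit into the sign position.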
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
Amt = DAG.getBitcast(VT, Amt);
if (Opc == ISD::SHL || Opc == ISD::SRL) {
// r = VSELECT(r, shift(r, 4), a);
SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
return R;
}
if (Opc == ISD::SRA) {
// For SRA we need to unpack each byte to the higher byte of a i16 vector
// so we can correctly sign extend. We don't care what happens to the
// lower byte.
SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
// r = VSELECT(r, shift(r, 4), a);
SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 2), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 1), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// Logical shift the result back to the lower byte, leaving a zero upper
// byte meaning that we can safely pack with PACKUSWB.
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
}
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
}
if (VT == MVT::v8i16) {
// If we have a constant shift amount, the non-SSE41 path is best as
// avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
bool UseSSE41 = Subtarget.hasSSE41() &&
!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
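// After this, bit 3 of each 4-bit amount ends up in the sign bit; the selects
// below then conditionally shift by 8, 4, 2 and 1, with 'a += a' exposing the
// next amount bit each time.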
if (UseSSE41) {
// On SSE41 targets we need to replicate the shift mask in both
// bytes for PBLENDVB.
Amt = DAG.getNode(
ISD::OR, dl, VT,
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
} else {
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
}
// r = VSELECT(r, shift(r, 8), a);
SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 4), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
R = SignBitSelect(Amt, M, R);
return R;
}
// Decompose 256-bit shifts into 128-bit shifts.
if (VT.is256BitVector())
return split256IntArith(Op, DAG);
return SDValue();
}
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.isVector() && "Custom lowering only for vector rotates!");
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
int NumElts = VT.getVectorNumElements();
// Check for constant splat rotation amount.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
int CstSplatIndex = -1;
if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
for (int i = 0; i != NumElts; ++i)
if (!UndefElts[i]) {
if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
CstSplatIndex = i;
continue;
}
CstSplatIndex = -1;
break;
}
// AVX512 implicitly uses modulo rotation amounts.
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (0 <= CstSplatIndex) {
unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(Op, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
return Op;
}
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
// XOP implicitly uses modulo rotation amounts.
if (Subtarget.hasXOP()) {
if (VT.is256BitVector())
return split256IntArith(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
if (0 <= CstSplatIndex) {
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
return Op;
}
// Split 256-bit integers on pre-AVX2 targets.
if (VT.is256BitVector() && !Subtarget.hasAVX2())
return split256IntArith(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
Subtarget.hasAVX2())) &&
"Only vXi32/vXi16/vXi8 vector rotates supported");
// Rotate by a uniform constant - expand back to shifts.
if (0 <= CstSplatIndex)
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
// v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
// the amount bit.
if (EltSizeInBits == 8 && !IsSplatAmt) {
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
return SDValue();
// We don't need ModuloAmt here as we just peek at individual bits.
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, DL, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
Amt = DAG.getBitcast(VT, Amt);
// r = VSELECT(r, rot(r, 4), a);
SDValue M;
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// r = VSELECT(r, rot(r, 2), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// return VSELECT(r, rot(r, 1), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
return SignBitSelect(VT, Amt, M, R);
}
// ISD::ROT* uses modulo rotate amounts.
Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
// Fallback for splats + all supported variable shifts.
// Fallback for non-constants AVX2 vXi16 as well.
if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
}
// As with shifts, convert the rotation amount to a multiplication factor.
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
assert(Scale && "Failed to convert ROTL amount to scale");
// v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
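// R * (1 << A) as a 32-bit product contains both halves of the rotate: its
// low 16 bits are R << A and its high 16 bits are R >> (16 - A) (zero when
// A == 0), so ORing MUL with MULHU gives ROTL.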
if (EltSizeInBits == 16) {
SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
// v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
// to v2i64 results at a time. The upper 32-bits contain the wrapped bits
// that can then be OR'd with the lower 32-bits.
assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
static const int OddMask[] = {1, -1, 3, -1};
SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R),
DAG.getBitcast(MVT::v2i64, Scale));
SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R13),
DAG.getBitcast(MVT::v2i64, Scale13));
Res02 = DAG.getBitcast(VT, Res02);
Res13 = DAG.getBitcast(VT, Res13);
return DAG.getNode(ISD::OR, DL, VT,
DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
if (OpWidth == 128)
return Subtarget.hasCmpxchg16b();
return false;
}
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
// TODO: In 32-bit mode, use FISTP when X87 is available?
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
bool NoImplicitFloatOps =
SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
return false;
return needsCmpXchgNb(MemType);
}
// Note: this turns large loads into lock cmpxchg8b/16b.
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
Type *MemType = LI->getType();
// If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
// can use movq to do the load. If we have X87 we can load into an 80-bit
// X87 register and store it to a stack temporary.
bool NoImplicitFloatOps =
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE2() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
default:
llvm_unreachable("Unknown atomic operation");
case AtomicRMWInst::Xchg:
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
return AtomicExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
}
}
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// harmful as it introduces a mfence.
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;
// If this is a canonical idempotent atomicrmw w/no uses, we have a better
// lowering available in lowerAtomicArith.
// TODO: push more cases through this path.
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
AI->use_empty())
return nullptr;
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
// is required:
// Thread 0:
// x.store(1, relaxed);
// r1 = y.fetch_add(0, release);
// Thread 1:
// y.fetch_add(42, acquire);
// r2 = x.load(relaxed);
// r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
// lowered to just a load without a fence. A mfence flushes the store buffer,
// making the optimization clearly correct.
// FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
// otherwise; we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
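// As a rough sketch of the resulting lowering (hypothetical %p, assuming an
// SSE2 target and that the RMW result is used), something like:
//   %r = atomicrmw or i32* %p, i32 0 acquire
// becomes approximately:
//   call void @llvm.x86.sse2.mfence()
//   %r = load atomic i32, i32* %p acquire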
if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;
Function *MFence =
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
LoadInst *Loaded =
Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
AI->getType()->getPrimitiveSizeInBits());
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
}
/// Emit a locked operation on a stack location which does not change any
/// memory location, but does involve a lock prefix. Location is chosen to be
/// a) very likely accessed only by a single thread to minimize cache traffic,
/// and b) definitely dereferenceable. Returns the new Chain result.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue Chain, SDLoc DL) {
// Implementation notes:
// 1) LOCK prefix creates a full read/write reordering barrier for memory
// operations issued by the current processor. As such, the location
// referenced is not relevant for the ordering properties of the instruction.
// See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
// 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
// 2) Using an immediate operand appears to be the best encoding choice
// here since it doesn't require an extra register.
// 3) OR appears to be very slightly faster than ADD. (Though, the difference
// is small enough it might just be measurement noise.)
// 4) When choosing offsets, there are several contributing factors:
// a) If there's no redzone, we default to TOS. (We could allocate a cache
// line aligned stack object to improve this case.)
// b) To minimize our chances of introducing a false dependence, we prefer
// to offset the stack usage from TOS slightly.
// c) To minimize concerns about cross thread stack usage - in particular,
// the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
// captures state in the TOS frame and accesses it from many threads -
// we want to use an offset such that the offset is in a distinct cache
// line from the TOS frame.
//
// For a general discussion of the tradeoffs and benchmark results, see:
// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
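// As a concrete sketch of what this emits: on x86-64 with a 128-byte red
// zone it is roughly `lock or dword ptr [rsp - 64], 0`, and on 32-bit
// targets (no red zone) roughly `lock or dword ptr [esp], 0`.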
auto &MF = DAG.getMachineFunction();
auto &TFL = *Subtarget.getFrameLowering();
const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
if (Subtarget.is64Bit()) {
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
SDValue Ops[] = {
DAG.getRegister(X86::RSP, MVT::i64), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i64), // Index
DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
Zero,
Chain};
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
MVT::Other, Ops);
return SDValue(Res, 1);
}
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
SDValue Ops[] = {
DAG.getRegister(X86::ESP, MVT::i32), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i32), // Index
DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
Zero,
Chain
};
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
MVT::Other, Ops);
return SDValue(Res, 1);
}
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
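// A rough sketch of the mapping (not exhaustive):
//   fence seq_cst  -> MFENCE when available, otherwise the locked stack OR
//                     emitted by emitLockedStackOp
//   fence acquire/release/acq_rel, or any single-thread fence
//                  -> X86ISD::MEMBARRIER, which emits no instruction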
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
switch(T.SimpleTy) {
default: llvm_unreachable("Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
case MVT::i16: Reg = X86::AX; size = 2; break;
case MVT::i32: Reg = X86::EAX; size = 4; break;
case MVT::i64:
assert(Subtarget.is64Bit() && "Node not type legal!");
Reg = X86::RAX; size = 8;
break;
}
SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
Op.getOperand(2), SDValue());
SDValue Ops[] = { cpIn.getValue(0),
Op.getOperand(1),
Op.getOperand(3),
DAG.getTargetConstant(size, DL, MVT::i8),
cpIn.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
Ops, T, MMO);
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
cpOut, Success, EFLAGS.getValue(1));
}
// Create MOVMSKB, taking into account whether we need to split for AVX1.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT InVT = V.getSimpleValueType();
if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
DAG.getConstant(16, DL, MVT::i8));
return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
}
return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
// Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
// half to v32i1 and concatenating the result.
if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
assert(Subtarget.hasBWI() && "Expected BWI target");
SDLoc dl(Op);
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(0, dl));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(1, dl));
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
}
// Custom splitting for BWI types when AVX512F is available but BWI isn't.
if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
SDLoc dl(Op);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
DstVT.getVectorNumElements() / 2);
Lo = DAG.getBitcast(CastVT, Lo);
Hi = DAG.getBitcast(CastVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
}
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
SDLoc DL(Op);
SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
V = getPMOVMSKB(DL, V, DAG, Subtarget);
return DAG.getZExtOrTrunc(V, DL, DstVT);
}
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (DstVT != MVT::f64 && DstVT != MVT::i64 &&
!(DstVT == MVT::x86mmx && SrcVT.isVector()))
// This conversion needs to be expanded.
return SDValue();
SDLoc dl(Op);
if (SrcVT.isVector()) {
// Widen the input vector in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
SrcVT.getVectorNumElements() * 2);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
DAG.getUNDEF(SrcVT));
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
}
MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
if (DstVT == MVT::x86mmx)
return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
DAG.getIntPtrConstant(0, dl));
}
assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
Subtarget.hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
"Unexpected custom BITCAST");
// i64 <=> MMX conversions are Legal.
if (SrcVT==MVT::i64 && DstVT.isVector())
return Op;
if (DstVT==MVT::i64 && SrcVT.isVector())
return Op;
// MMX <=> MMX conversions are Legal.
if (SrcVT.isVector() && DstVT.isVector())
return Op;
// All other conversions need to be expanded.
return SDValue();
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(V);
MVT ByteVecVT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
"Expected value to have byte element type.");
assert(EltVT != MVT::i8 &&
"Horizontal byte sum only makes sense for wider elements!");
unsigned VecSize = VT.getSizeInBits();
assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
// The PSADBW instruction horizontally adds all bytes and leaves the result in
// i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
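// For example (sketch): if every byte of V holds the per-byte count 1,
// PSADBW against zero sums each group of eight bytes into its i64 lane,
// yielding 8 per lane, which is exactly the i64 population count.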
if (EltVT == MVT::i64) {
SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
return DAG.getBitcast(VT, V);
}
if (EltVT == MVT::i32) {
// We unpack the low half and high half into i32s interleaved with zeros so
// that we can use PSADBW to horizontally sum them. The most useful part of
// this is that it lines up the results of two PSADBW instructions to be
// two v2i64 vectors which concatenated are the 4 population counts. We can
// then use PACKUSWB to shrink and concatenate them into a v4i32 again.
SDValue Zeros = DAG.getConstant(0, DL, VT);
SDValue V32 = DAG.getBitcast(VT, V);
SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
// Do the horizontal sums into two v2i64s.
Zeros = DAG.getConstant(0, DL, ByteVecVT);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, Low), Zeros);
High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, High), Zeros);
// Merge them together.
MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
DAG.getBitcast(ShortVecVT, Low),
DAG.getBitcast(ShortVecVT, High));
return DAG.getBitcast(VT, V);
}
// The only element type left is i16.
assert(EltVT == MVT::i16 && "Unknown how to handle type");
// To obtain the pop count for each i16 element starting from the per-i8 pop
// counts, shift the i16s left by 8, sum as i8s, and then shift the i16s
// right by 8. It is important to shift as i16s because an i8 vector shift
// isn't directly supported.
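// Worked example (sketch): if an i16 element holds per-byte counts [a, b]
// (high, low), SHL 8 gives [b, 0]; the byte-wise add with [a, b] gives
// [a+b, b]; the final SRL 8 leaves a+b, the i16 pop count.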
SDValue ShifterV = DAG.getConstant(8, DL, VT);
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
DAG.getBitcast(ByteVecVT, V));
return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
int NumElts = VT.getVectorNumElements();
(void)EltVT;
assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
// Implement a lookup table in register by using an algorithm based on:
// http://wm.ite.pl/articles/sse-popcount.html
//
// The general idea is that every lower byte nibble in the input vector is an
// index into an in-register pre-computed pop count table. We then split up the
// input vector into two new ones: (1) a vector with only the shifted-right
// higher nibbles for each byte and (2) a vector with the lower nibbles (and
// masked-out higher ones) for each byte. PSHUFB is used separately with both
// to index the in-register table. Next, both are added and the result is an
// i8 vector where each element contains the pop count for its input byte.
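// Worked example (sketch): for the input byte 0xB7 (0b10110111), the high
// nibble 0xB and the low nibble 0x7 each index a LUT entry of 3, so the
// reported count is 3 + 3 = 6, matching the six set bits.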
const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
SDValue M0F = DAG.getConstant(0x0F, DL, VT);
// High nibbles
SDValue FourV = DAG.getConstant(4, DL, VT);
SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
// Low nibbles
SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
// The input vector is used as the shuffle mask that indexes elements into the
// LUT. After counting low and high nibbles, add the two results to obtain the
// final pop count per i8 element.
SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
}
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
"Unknown CTPOP type to handle");
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
if (Subtarget.hasVPOPCNTDQ()) {
unsigned NumElems = VT.getVectorNumElements();
assert((VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16) && "Unexpected type");
if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
}
}
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return Lower512IntUnary(Op, DAG);
// For element types greater than i8, do vXi8 pop counts and a bytesum.
if (VT.getScalarType() != MVT::i8) {
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
}
// We can't use the fast LUT approach, so fall back on LegalizeDAG.
if (!Subtarget.hasSSSE3())
return SDValue();
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().isVector() &&
"We only do custom lowering for vector population count.");
return LowerVectorCTPOP(Op, Subtarget, DAG);
}
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
// For scalars, it's still beneficial to transfer to/from the SIMD unit to
// perform the BITREVERSE.
if (!VT.isVector()) {
MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
int NumElts = VT.getVectorNumElements();
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector())
return Lower256IntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
// VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
// perform the BSWAP in the shuffle.
// It's best to shuffle using the second operand, as this will implicitly allow
// memory folding for multiple vectors.
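// Illustrative sketch: for a v4i32 bitreverse, element 0 is built from
// source bytes 19..16 of the second operand in reverse order, each tagged
// with the bit-reverse op (2 << 5), giving mask bytes 0x53, 0x52, 0x51, 0x50.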
SmallVector<SDValue, 16> MaskElts;
for (int i = 0; i != NumElts; ++i) {
for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
int PermuteByte = SourceByte | (2 << 5);
MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
}
}
SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
SDValue Res = DAG.getBitcast(MVT::v16i8, In);
Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
Res, Mask);
return DAG.getBitcast(VT, Res);
}
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasXOP() && !VT.is512BitVector())
return LowerBITREVERSE_XOP(Op, DAG);
assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
"Only byte vector BITREVERSE supported");
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
// 0-15 value (moved to the other nibble).
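// Worked example (sketch): for the byte 0x1E, the low nibble 0xE maps
// through LoLUT to 0x70 and the high nibble 0x1 maps through HiLUT to 0x08;
// OR'ing them gives 0x78, which is 0x1E with its bits reversed.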
SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
const int LoLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
/* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
/* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
/* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
const int HiLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
/* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
/* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
/* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
for (unsigned i = 0; i < NumElts; ++i) {
LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
}
SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NewOpc = 0;
switch (N->getOpcode()) {
case ISD::ATOMIC_LOAD_ADD:
NewOpc = X86ISD::LADD;
break;
case ISD::ATOMIC_LOAD_SUB:
NewOpc = X86ISD::LSUB;
break;
case ISD::ATOMIC_LOAD_OR:
NewOpc = X86ISD::LOR;
break;
case ISD::ATOMIC_LOAD_XOR:
NewOpc = X86ISD::LXOR;
break;
case ISD::ATOMIC_LOAD_AND:
NewOpc = X86ISD::LAND;
break;
default:
llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
}
MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
return DAG.getMemIntrinsicNode(
NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
{N->getOperand(0), N->getOperand(1), N->getOperand(2)},
/*MemVT=*/N->getSimpleValueType(0), MMO);
}
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
unsigned Opc = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// We can lower atomic_load_add into LXADD. However, any other atomicrmw op
// can only be lowered when the result is unused. They should have already
// been transformed into a cmpxchg loop in AtomicExpand.
if (N->hasAnyUseOfValue(0)) {
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
}
assert(Opc == ISD::ATOMIC_LOAD_ADD &&
"Used AtomicRMW ops other than Add should have been expanded!");
return N;
}
// Specialized lowering for the canonical form of an idempotent atomicrmw.
// The core idea here is that since the memory location isn't actually
// changing, all we need is a lowering for the *ordering* impacts of the
// atomicrmw. As such, we can choose a different operation and memory
// location to minimize impact on other code.
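// A minimal sketch (hypothetical IR): an unused
//   atomicrmw or i32* %p, i32 0 seq_cst
// takes the locked-stack-op path below instead of a `lock or` on %p itself,
// while weaker orderings collapse to the no-op MEMBARRIER node.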
if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
// On X86, the only ordering which actually requires an instruction is
// seq_cst outside of the SingleThread scope; everything else just needs to
// be preserved during codegen and then dropped. Note that we expect (but
// don't assume) that orderings other than seq_cst and acq_rel have been
// canonicalized to a store or load.
if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
AN->getSyncScopeID() == SyncScope::System) {
// Prefer a locked operation against a stack location to minimize cache
// traffic. This assumes that stack locations are very likely to be
// accessed only by the owning thread.
SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), NewChain);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), NewChain);
}
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), LockOp.getValue(1));
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
auto *Node = cast<AtomicSDNode>(Op.getNode());
SDLoc dl(Node);
EVT VT = Node->getMemoryVT();
bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
// If this store is not sequentially consistent and the type is legal
// we can just keep it.
if (!IsSeqCst && IsTypeLegal)
return Op;
if (VT == MVT::i64 && !IsTypeLegal) {
// For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
// FIXME: Use movlps with SSE1.
// FIXME: Use fist with X87.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
Subtarget.hasSSE2()) {
SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
Node->getOperand(2));
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
Ops, MVT::i64,
Node->getMemOperand());
// If this is a sequentially consistent store, also emit an appropriate
// barrier.
if (IsSeqCst)
Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
return Chain;
}
}
// Convert seq_cst store -> xchg
// Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
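// For example (sketch): a legal-typed `store atomic i32 %v, i32* %p seq_cst`
// becomes an ATOMIC_SWAP here and is ultimately selected as an xchg, whose
// implicit lock provides the required ordering.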
SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
Node->getMemoryVT(),
Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2),
Node->getMemOperand());
return Swap.getValue(1);
}
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
MVT VT = N->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDLoc DL(N);
// Set the carry flag.
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getConstant(NegOne, DL, CarryVT));
unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
Op.getOperand(1), Carry.getValue(1));
SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
// { double, double } (which is returned in XMM0, XMM1).
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
const char *LibcallName = TLI.getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
: (Type *)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
.setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
// Returned in xmm0 and xmm1.
return CallResult.first;
// Returned in bits 0:31 and 32:63 of xmm0.
SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(0, dl));
SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(1, dl));
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
bool FillWithZeroes = false) {
// Check if InOp already has the right width.
MVT InVT = InOp.getSimpleValueType();
if (InVT == NVT)
return InOp;
if (InOp.isUndef())
return DAG.getUNDEF(NVT);
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
unsigned InNumElts = InVT.getVectorNumElements();
unsigned WidenNumElts = NVT.getVectorNumElements();
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
SDLoc dl(InOp);
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
SDValue N1 = InOp.getOperand(1);
if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
N1.isUndef()) {
InOp = InOp.getOperand(0);
InVT = InOp.getSimpleValueType();
InNumElts = InVT.getVectorNumElements();
}
}
if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0; i < InNumElts; ++i)
Ops.push_back(InOp.getOperand(i));
EVT EltVT = InOp.getOperand(0).getValueType();
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
Ops.push_back(FillVal);
return DAG.getBuildVector(NVT, dl, Ops);
}
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
DAG.getUNDEF(NVT);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
InOp, DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
SDValue Src = N->getValue();
MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
SDValue Scale = N->getScale();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
if (VT == MVT::v2f32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
return SDValue(NewScatter.getNode(), 1);
}
return SDValue();
}
if (VT == MVT::v2i32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(MVT::v2i32));
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
return SDValue(NewScatter.getNode(), 1);
}
// Custom widen all the operands to avoid promotion.
EVT NewIndexVT = EVT::getVectorVT(
*DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
DAG.getUNDEF(Index.getValueType()));
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getConstant(0, dl, MVT::v2i1));
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
Ops, N->getMemOperand());
}
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the data nor the index is 512 bits, we
// need to widen until one is.
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
unsigned Factor = std::min(512/VT.getSizeInBits(),
512/IndexVT.getSizeInBits());
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
Src = ExtendToType(Src, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
return SDValue(NewScatter.getNode(), 1);
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
MVT MaskVT = Mask.getSimpleValueType();
SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
// Handle AVX masked loads which don't support passthru other than 0.
if (MaskVT.getVectorElementType() != MVT::i1) {
// We also allow undef in the isel pattern.
if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
return Op;
SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
N->getBasePtr(), Mask,
getZeroVector(VT, Subtarget, DAG, dl),
N->getMemoryVT(), N->getMemOperand(),
N->getExtensionType(),
N->isExpandingLoad());
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
PassThru);
return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
}
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!");
assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked load op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked load op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
PassThru = ExtendToType(PassThru, WideDataVT, DAG);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
"Unexpected mask type");
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
N->getBasePtr(), Mask, PassThru,
N->getMemoryVT(), N->getMemOperand(),
N->getExtensionType(),
N->isExpandingLoad());
SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewLoad.getValue(0),
DAG.getIntPtrConstant(0, dl));
SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
SDValue DataToStore = N->getValue();
MVT VT = DataToStore.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
"Compressing masked store is supported on AVX-512 target only!");
assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
"Compressing masked store is supported for 32 and 64-bit types only!");
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked store op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked store op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
"Unexpected mask type");
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
Mask, N->getMemoryVT(), N->getMemOperand(),
N->isTruncatingStore(), N->isCompressingStore());
}
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue PassThru = N->getPassThru();
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
// If the index is v2i32, we're being called by type legalization.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the passthru nor the index is 512 bits, we
// need to widen until one is.
MVT OrigVT = VT;
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!IndexVT.is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
unsigned Factor = std::min(512/VT.getSizeInBits(),
512/IndexVT.getSizeInBits());
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
PassThru = ExtendToType(PassThru, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
N->getMemOperand());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
NewGather, DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
// no-ops in the case of a null GC strategy (or a GC strategy which does not
// require special handling for these nodes), lower them as literal NOOPs for
// the time being.
SmallVector<SDValue, 2> Ops;
Ops.push_back(Op.getOperand(0));
if (Op->getGluedNode())
Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
SDLoc OpDL(Op);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
return NOOP;
}
SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
// no-ops in the case of a null GC strategy (or a GC strategy which does not
// require special handling for these nodes), lower them as literal NOOPs for
// the time being.
SmallVector<SDValue, 2> Ops;
Ops.push_back(Op.getOperand(0));
if (Op->getGluedNode())
Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
SDLoc OpDL(Op);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
return NOOP;
}
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::EH_SJLJ_SETUP_DISPATCH:
return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::ROTL:
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::ADD:
case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
return LowerGC_TRANSITION_START(Op, DAG);
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
}
}
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDValue Res = LowerOperation(SDValue(N, 0), DAG);
if (!Res.getNode())
return;
// If the original node has one result, take the return value from
// LowerOperation as is. It might not be result number 0.
if (N->getNumValues() == 1) {
Results.push_back(Res);
return;
}
// If the original node has multiple results, then the return node should
// have the same number of results.
assert((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
// Place new result values based on the result numbers of N.
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
Results.push_back(Res.getValue(I));
}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
SDLoc dl(N);
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "ReplaceNodeResults: ";
N->dump(&DAG);
#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
SDValue Wide =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
// Bit count should fit in 32-bits, extract it as that and then zero
// extend to i64. Otherwise we end up extracting bits 63:32 separately.
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
DAG.getIntPtrConstant(0, dl));
Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
Results.push_back(Wide);
}
return;
}
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Unexpected VT");
if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
VT.getVectorNumElements() == 2) {
// Promote to a pattern that will be turned into PMULUDQ.
SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
N->getOperand(0));
SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
N->getOperand(1));
SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
} else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
VT.getVectorElementType() == MVT::i8) {
// Pre-promote these to vXi16 to avoid op legalization thinking all 16
// elements are needed.
MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
unsigned NumConcats = 16 / VT.getVectorNumElements();
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
Results.push_back(Res);
}
return;
}
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
case ISD::SSUBSAT:
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
// Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
// X86ISD::AVG/VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
EVT InVT = N->getOperand(0).getValueType();
assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
"Expected a VT that divides into 128 bits.");
unsigned NumConcat = 128 / InVT.getSizeInBits();
EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(),
NumConcat * InVT.getVectorNumElements());
EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
VT.getVectorElementType(),
NumConcat * VT.getVectorNumElements());
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = N->getOperand(0);
SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
Ops[0] = N->getOperand(1);
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
case ISD::ABS: {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(N->getValueType(0) == MVT::i64 &&
"Unexpected type (!= i64) on ABS.");
MVT HalfT = MVT::i32;
SDValue Lo, Hi, Tmp;
SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(0, dl, HalfT));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(1, dl, HalfT));
Tmp = DAG.getNode(
ISD::SRA, dl, HalfT, Hi,
DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
SDValue(Lo.getNode(), 1));
Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
Results.push_back(Lo);
Results.push_back(Hi);
return;
}
case ISD::SETCC: {
// Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when the
// setcc result type is v2i1, because type legalization will end up with
// a v4i1 setcc plus an extend.
assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
if (N->getOperand(0).getValueType() != MVT::v2f32 ||
getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
return;
SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(1), UNDEF);
SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
N->getOperand(2));
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
case X86ISD::FMAXC:
case X86ISD::FMAX: {
EVT VT = N->getValueType(0);
assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
SDValue UNDEF = DAG.getUNDEF(VT);
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(1), UNDEF);
Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
return;
}
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: {
EVT VT = N->getValueType(0);
if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) {
// If the RHS is a constant splat vector, we can widen this and let the
// division/remainder-by-constant optimization handle it.
// TODO: Can we do something for non-splat?
APInt SplatVal;
if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
unsigned NumConcats = 128 / VT.getSizeInBits();
SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
Ops0[0] = N->getOperand(0);
EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
Results.push_back(Res);
}
return;
}
if (VT == MVT::v2i32) {
// Legalize v2i32 div/rem by unrolling. Otherwise we promote to the
// v2i64 and unroll later. But then we create i64 scalar ops which
// might be slow in 64-bit mode or require a libcall in 32-bit mode.
Results.push_back(DAG.UnrollVectorOp(N));
return;
}
if (VT.isVector())
return;
LLVM_FALLTHROUGH;
}
case ISD::SDIVREM:
case ISD::UDIVREM: {
SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
Results.push_back(V);
return;
}
case ISD::TRUNCATE: {
MVT VT = N->getSimpleValueType(0);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
// The generic legalizer will try to widen the input type to the same
// number of elements as the widened result type. But this isn't always
// the best thing so do some custom legalization to avoid some cases.
MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
unsigned InBits = InVT.getSizeInBits();
if (128 % InBits == 0) {
// Inputs of 128 bits and smaller should avoid the truncate altogether and
// just use a build_vector that will become a shuffle.
// TODO: Widen and use a shuffle directly?
MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
EVT EltVT = VT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar ops than
// necessary.
unsigned MinElts = VT.getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
DAG.getIntPtrConstant(i, dl));
Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
}
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
return;
}
// With AVX512 there are some cases that can use a target specific
// truncate node to go from 256/512 to less than 128 with zeros in the
// upper elements of the 128 bit result.
if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
// We can use VTRUNC directly for a 256-bit input with VLX or for any 512-bit input.
if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
return;
}
// There's one case we can widen to 512 bits and use VTRUNC.
if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
DAG.getUNDEF(MVT::v4i64));
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
return;
}
}
return;
}
case ISD::SIGN_EXTEND_VECTOR_INREG: {
if (ExperimentalVectorWideningLegalization)
return;
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
(InVT == MVT::v16i16 || InVT == MVT::v32i8)) {
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
// we allow the sra from the extend to i32 to be shared by the split.
EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(),
InVT.getVectorNumElements() / 2);
MVT ExtendVT = MVT::getVectorVT(MVT::i32,
VT.getVectorNumElements());
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT,
In, DAG.getIntPtrConstant(0, dl));
In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In);
// Fill a vector with sign bits for each element.
SDValue Zero = DAG.getConstant(0, dl, ExtendVT);
SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// Create an unpackl and unpackh to interleave the sign bits then bitcast
// to vXi64.
SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
return;
}
return;
}
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
(InVT == MVT::v4i16 || InVT == MVT::v4i8) &&
getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
// we allow the sra from the extend to i32 to be shared by the split.
In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
// Fill a vector with sign bits for each element.
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
// Create an unpackl and unpackh to interleave the sign bits then bitcast
// to v2i64.
SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{0, 4, 1, 5});
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{2, 6, 3, 7});
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
return;
}
if (VT == MVT::v16i32 || VT == MVT::v8i64) {
if (!InVT.is128BitVector()) {
// Not a 128 bit vector, but maybe type legalization will promote
// it to 128 bits.
if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
return;
InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
if (!InVT.is128BitVector())
return;
// Promote the input to 128 bits. Type legalization will turn this into
// zext_inreg/sext_inreg.
In = DAG.getNode(N->getOpcode(), dl, InVT, In);
}
// Perform custom splitting instead of the two stage extend we would get
// by default.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
for (unsigned i = 0; i != HalfNumElts; ++i)
ShufMask[i] = i + HalfNumElts;
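// For example, with a v16i8 input this builds the mask
// <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>, moving the upper half of the
// input into the low elements so the in-vector extend below can see them.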
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
}
return;
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
// Promote these manually to avoid over promotion to v2i64. Type
// legalization will revisit the v2i32 operation for more cleanup.
if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
// AVX512DQ provides instructions that produce a v2i64 result.
if (Subtarget.hasDQI())
return;
SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
: ISD::AssertSext,
dl, MVT::v2i32, Res,
DAG.getValueType(VT.getVectorElementType()));
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
Results.push_back(Res);
return;
}
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result, except
// when the result is v2i32, since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
: ISD::AssertSext,
dl, PromoteVT, Res,
DAG.getValueType(VT.getVectorElementType()));
// Truncate back to the original width.
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
// Now widen to 128 bits.
unsigned NumConcats = 128 / VT.getSizeInBits();
MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
VT.getVectorNumElements() * NumConcats);
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
Results.push_back(Res);
return;
}
if (VT == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
bool Widenv2i32 =
getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector;
if (Src.getValueType() == MVT::v2f64) {
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
// If v2i32 is widened, we can defer to the generic legalizer.
if (Widenv2i32)
return;
// Custom widen by doubling to a legal vector width. Isel will
// further widen to v8f64.
Opc = ISD::FP_TO_UINT;
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64,
Src, DAG.getUNDEF(MVT::v2f64));
}
SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
if (!Widenv2i32)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
if (SrcVT == MVT::v2f32 &&
getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
SDValue Idx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
}
// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
// so early out here.
return;
}
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
// Using a 256-bit input here to guarantee 128-bit input for f32 case.
// TODO: Use 128-bit vectors for f64 case?
// TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
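// Insert the scalar into lane 0 of an all-zero vector, convert the whole
// vector with the packed AVX512DQ conversion, and extract the converted
// element back out of lane 0.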
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
DAG.getConstantFP(0.0, dl, VecInVT), Src,
ZeroIdx);
Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
Results.push_back(Res);
return;
}
if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
Results.push_back(V);
return;
}
case ISD::SINT_TO_FP: {
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
SDValue Src = N->getOperand(0);
if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
return;
Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
return;
}
case ISD::UINT_TO_FP: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
return;
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
return;
}
if (SrcVT != MVT::v2i32)
return;
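// Standard double-precision bias trick: 0x4330000000000000 is the bit
// pattern of 2^52, so OR-ing a zero-extended 32-bit lane into its low bits
// yields the double 2^52 + x exactly. Subtracting 2^52 then recovers x as a
// double, and VFPROUND narrows the v2f64 result into the low half of a
// v4f32.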
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
return;
}
case ISD::FP_ROUND: {
if (!isTypeLegal(N->getOperand(0).getValueType()))
return;
SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
return;
}
case ISD::FP_EXTEND: {
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
"Do not know how to legalize this Node");
return;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
Results);
case Intrinsic::x86_rdtscp:
return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
Results);
case Intrinsic::x86_rdpmc:
expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
Results);
return;
case Intrinsic::x86_xgetbv:
expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
Results);
return;
}
}
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
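// CMPXCHG8B/CMPXCHG16B expect the comparand in EDX:EAX (RDX:RAX), the new
// value in ECX:EBX (RCX:RBX), and leave the old memory value in EDX:EAX
// with ZF set on success, which is why the halves are copied into those
// fixed registers below.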
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(0, dl, HalfT));
cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(1, dl, HalfT));
cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
cpInL, SDValue());
cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
Regs64bit ? X86::RDX : X86::EDX,
cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
swapInH =
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
swapInH, cpInH.getValue(1));
// If the current function needs the base pointer, RBX,
// we shouldn't use cmpxchg directly.
// The lowering of that instruction will clobber
// that register, and since RBX will then be a reserved register
// the register allocator will not make sure its value is
// properly saved and restored around this live range.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
unsigned BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
// ISel prefers the LCMPXCHG64 variant.
// If that assert breaks, that means it is not the case anymore,
// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
// not just EBX. This is a matter of accepting i64 input for that
// pseudo, and restoring into the register of the right width
// in the expand pseudo. Everything else should just work.
assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
"Saving only half of the RBX");
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX,
HalfT, swapInH.getValue(1));
SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
RBXSave,
/*Glue*/ RBXSave.getValue(2)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
} else {
unsigned Opcode =
Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX, swapInL,
swapInH.getValue(1));
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
swapInL.getValue(1)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
}
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
MVT::i32, cpOutH.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(Success);
Results.push_back(EFLAGS.getValue(1));
return;
}
case ISD::ATOMIC_LOAD: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
if (Subtarget.hasSSE2()) {
// Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
// lower 64-bits.
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
}
if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
// FIXME: Do we need to glue? See FIXME comment in BuildFILD.
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
dl, Tys, Ops, MVT::i64,
Node->getMemOperand());
SDValue Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// Now store the X87 register to a stack temporary and convert to i64.
// This store is not atomic and doesn't need to be.
// FIXME: We don't need a stack temporary if the result of the load
// is already being stored. We could just directly store there.
SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
DAG.getVTList(MVT::Other), StoreOps,
MVT::i64, MPI, 0 /*Align*/,
MachineMemOperand::MOStore);
// Finally load the value back from the stack temporary and return it.
// This load is not atomic and doesn't need to be.
// This load will be further type legalized.
Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
Results.push_back(Result);
Results.push_back(Result.getValue(1));
return;
}
}
// TODO: Use MOVLPS when SSE1 is available?
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
}
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
// If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
// we can split using the k-register rather than memory.
if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
Lo = DAG.getBitcast(MVT::i32, Lo);
Hi = DAG.getBitcast(MVT::i32, Hi);
SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
Results.push_back(Res);
return;
}
// Custom splitting for BWI types when AVX512F is available but BWI isn't.
if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
SrcVT.isVector() && isTypeLegal(SrcVT)) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
Lo = DAG.getBitcast(CastVT, Lo);
Hi = DAG.getBitcast(CastVT, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
Results.push_back(Res);
return;
}
if (SrcVT != MVT::f64 ||
(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
return;
unsigned NumElts = DstVT.getVectorNumElements();
EVT SVT = DstVT.getVectorElementType();
EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue Res;
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
Res = DAG.getBitcast(WiderVT, Res);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
return;
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
Gather->getPassThru(),
DAG.getUNDEF(MVT::v2f32));
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getUNDEF(MVT::v2i1));
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(2));
return;
}
if (VT == MVT::v2i32) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
Gather->getPassThru(),
DAG.getUNDEF(MVT::v2i32));
// If the index is v2i64 we can use it directly.
if (Index.getValueType() == MVT::v2i64 &&
(Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getUNDEF(MVT::v2i1));
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
SDValue Chain = Res.getValue(2);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Chain);
return;
}
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
EVT IndexVT = Index.getValueType();
EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
IndexVT.getScalarType(), 4);
// Otherwise we need to custom widen everything to avoid promotion.
Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
DAG.getUNDEF(IndexVT));
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getConstant(0, dl, MVT::v2i1));
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
Gather->getMemoryVT(), dl, Ops,
Gather->getMemOperand());
SDValue Chain = Res.getValue(1);
if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Chain);
return;
}
}
return;
}
case ISD::LOAD: {
// Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
// avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
// cast since type legalization will try to use an i64 load.
MVT VT = N->getSimpleValueType(0);
assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT WideVT = MVT::getVectorVT(LdVT, 2);
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() * 2);
Res = DAG.getBitcast(CastVT, Res);
Results.push_back(Res);
Results.push_back(Chain);
return;
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
}
}
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
case X86ISD::FIRST_NUMBER: break;
case X86ISD::BSF: return "X86ISD::BSF";
case X86ISD::BSR: return "X86ISD::BSR";
case X86ISD::SHLD: return "X86ISD::SHLD";
case X86ISD::SHRD: return "X86ISD::SHRD";
case X86ISD::FAND: return "X86ISD::FAND";
case X86ISD::FANDN: return "X86ISD::FANDN";
case X86ISD::FOR: return "X86ISD::FOR";
case X86ISD::FXOR: return "X86ISD::FXOR";
case X86ISD::FILD: return "X86ISD::FILD";
case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
case X86ISD::FIST: return "X86ISD::FIST";
case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
case X86ISD::FLD: return "X86ISD::FLD";
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
case X86ISD::IRET: return "X86ISD::IRET";
case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
case X86ISD::Wrapper: return "X86ISD::Wrapper";
case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::BLENDV: return "X86ISD::BLENDV";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAXS: return "X86ISD::FMAXS";
case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMINS: return "X86ISD::FMINS";
case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
case X86ISD::EH_SJLJ_SETUP_DISPATCH:
return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
case X86ISD::LADD: return "X86ISD::LADD";
case X86ISD::LSUB: return "X86ISD::LSUB";
case X86ISD::LOR: return "X86ISD::LOR";
case X86ISD::LXOR: return "X86ISD::LXOR";
case X86ISD::LAND: return "X86ISD::LAND";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC";
case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS";
case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS";
case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
case X86ISD::VSRL: return "X86ISD::VSRL";
case X86ISD::VSRA: return "X86ISD::VSRA";
case X86ISD::VSHLI: return "X86ISD::VSHLI";
case X86ISD::VSRLI: return "X86ISD::VSRLI";
case X86ISD::VSRAI: return "X86ISD::VSRAI";
case X86ISD::VSHLV: return "X86ISD::VSHLV";
case X86ISD::VSRLV: return "X86ISD::VSRLV";
case X86ISD::VSRAV: return "X86ISD::VSRAV";
case X86ISD::VROTLI: return "X86ISD::VROTLI";
case X86ISD::VROTRI: return "X86ISD::VROTRI";
case X86ISD::VPPERM: return "X86ISD::VPPERM";
case X86ISD::CMPP: return "X86ISD::CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
case X86ISD::SBB: return "X86ISD::SBB";
case X86ISD::SMUL: return "X86ISD::SMUL";
case X86ISD::UMUL: return "X86ISD::UMUL";
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::BZHI: return "X86ISD::BZHI";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::KADD: return "X86ISD::KADD";
case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::VALIGN: return "X86ISD::VALIGN";
case X86ISD::VSHLD: return "X86ISD::VSHLD";
case X86ISD::VSHRD: return "X86ISD::VSHRD";
case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
case X86ISD::SHUFP: return "X86ISD::SHUFP";
case X86ISD::SHUF128: return "X86ISD::SHUF128";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
case X86ISD::MOVSD: return "X86ISD::MOVSD";
case X86ISD::MOVSS: return "X86ISD::MOVSS";
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
case X86ISD::VRANGES: return "X86ISD::VRANGES";
case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::MFENCE: return "X86ISD::MFENCE";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
case X86ISD::VPSHA: return "X86ISD::VPSHA";
case X86ISD::VPSHL: return "X86ISD::VPSHL";
case X86ISD::VPCOM: return "X86ISD::VPCOM";
case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
case X86ISD::SELECTS: return "X86ISD::SELECTS";
case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
case X86ISD::RCP14: return "X86ISD::RCP14";
case X86ISD::RCP14S: return "X86ISD::RCP14S";
case X86ISD::RCP28: return "X86ISD::RCP28";
case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
case X86ISD::EXP2: return "X86ISD::EXP2";
case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
case X86ISD::FADDS: return "X86ISD::FADDS";
case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
case X86ISD::FSUBS: return "X86ISD::FSUBS";
case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
case X86ISD::FMULS: return "X86ISD::FMULS";
case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
case X86ISD::FDIVS: return "X86ISD::FDIVS";
case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
case X86ISD::AVG: return "X86ISD::AVG";
case X86ISD::MULHRS: return "X86ISD::MULHRS";
case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI";
case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI";
case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
case X86ISD::MGATHER: return "X86ISD::MGATHER";
case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
case X86ISD::ENQCMD: return "X86ISD::ENQCMD";
case X86ISD::ENQCMDS: return "X86ISD::ENQCMDS";
case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
}
return nullptr;
}
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
// If a reference to this global requires an extra load, we can't fold it.
if (isGlobalStubReference(GVFlags))
return false;
// If BaseGV requires a register for the PIC base, we cannot also have a
// BaseReg specified.
if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
return false;
// If lower 4G is not available, then we must use rip-relative addressing.
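// RIP-relative addressing is just [RIP + disp32]: there is no way to encode
// a scaled index, and an extra displacement on top of the global is
// conservatively rejected as well.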
if ((M != CodeModel::Small || isPositionIndependent()) &&
Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
return false;
}
switch (AM.Scale) {
case 0:
case 1:
case 2:
case 4:
case 8:
// These scales always work.
break;
case 3:
case 5:
case 9:
// These scales are formed with basereg+scalereg. Only accept if there is
// no basereg yet.
if (AM.HasBaseReg)
return false;
break;
default: // Other stuff never works.
return false;
}
return true;
}
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
unsigned Bits = Ty->getScalarSizeInBits();
// 8-bit shifts are always expensive, but versions with a scalar amount aren't
// particularly cheaper than those without.
if (Bits == 8)
return false;
// XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
(Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
return false;
// AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
// shifts just as cheap as scalar ones.
if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
return false;
// AVX512BW has shifts such as vpsllvw.
if (Subtarget.hasBWI() && Bits == 16)
return false;
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
// fully general vector.
return true;
}
bool X86TargetLowering::isBinOp(unsigned Opcode) const {
switch (Opcode) {
// These are non-commutative binops.
// TODO: Add more X86ISD opcodes once we have test coverage.
case X86ISD::ANDNP:
case X86ISD::PCMPGT:
case X86ISD::FMAX:
case X86ISD::FMIN:
case X86ISD::FANDN:
return true;
}
return TargetLoweringBase::isBinOp(Opcode);
}
bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
switch (Opcode) {
// TODO: Add more X86ISD opcodes once we have test coverage.
case X86ISD::PCMPEQ:
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ:
case X86ISD::FMAXC:
case X86ISD::FMINC:
case X86ISD::FAND:
case X86ISD::FOR:
case X86ISD::FXOR:
return true;
}
return TargetLoweringBase::isCommutativeBinOp(Opcode);
}
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 > NumBits2;
}
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
if (!isTypeLegal(EVT::getEVT(Ty1)))
return false;
assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
// Assuming the caller doesn't have a zeroext or signext return parameter,
// truncation all the way down to i1 is valid.
return true;
}
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
// Can also use sub to handle negated immediates.
return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 > NumBits2;
}
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
EVT VT1 = Val.getValueType();
if (isZExtFree(VT1, VT2))
return true;
if (Val.getOpcode() != ISD::LOAD)
return false;
if (!VT1.isSimple() || !VT1.isInteger() ||
!VT2.isSimple() || !VT2.isInteger())
return false;
switch (VT1.getSimpleVT().SimpleTy) {
default: break;
case MVT::i8:
case MVT::i16:
case MVT::i32:
// X86 has 8, 16, and 32-bit zero-extending loads.
return true;
}
return false;
}
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT SrcVT = ExtVal.getOperand(0).getValueType();
// There is no extending load for vXi1.
if (SrcVT.getScalarType() == MVT::i1)
return false;
return true;
}
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
if (!Subtarget.hasAnyFMA())
return false;
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
default:
break;
}
return false;
}
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
// i16 instructions are longer (0x66 prefix) and potentially slower.
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
if (!VT.isSimple())
return false;
// Not for i1 vectors
if (VT.getSimpleVT().getScalarType() == MVT::i1)
return false;
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSimpleVT().getSizeInBits() == 64)
return false;
// We only care that the types being shuffled are legal. The lowering can
// handle any possible shuffle mask that results.
return isTypeLegal(VT.getSimpleVT());
}
bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
EVT VT) const {
// Don't convert an 'and' into a shuffle that we don't directly support.
// vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
if (!Subtarget.hasAVX2())
if (VT == MVT::v32i8 || VT == MVT::v16i16)
return false;
// Just delegate to the generic legality, clear masks aren't special.
return isShuffleMaskLegal(Mask, VT);
}
bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
// If the subtarget is using retpolines, we must not generate jump tables.
if (Subtarget.useRetpolineIndirectBranches())
return false;
// Otherwise, fallback on the generic logic.
return TargetLowering::areJTsAllowed(Fn);
}
//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
//
// thisMBB:
// xbegin sinkMBB
//
// mainMBB:
// s0 = -1
//
// fallBB:
// eax = # XABORT_DEF
// s1 = eax
//
// sinkMBB:
// v = phi(s0/mainBB, s1/fallBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
// xbegin fallMBB
// # fallthrough to mainMBB
// # abort jumps to fallMBB
BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(fallMBB);
// mainMBB:
// mainDstReg := -1
BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
// fallMBB:
// ; pseudo instruction to model hardware's definition from XABORT
// EAX := XABORT_DEF
// fallDstReg := EAX
BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
fallMBB->addSuccessor(sinkMBB);
// sinkMBB:
// DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
// 0 ) Output : destination address (reg)
// 1-5) Input : va_list address (addr, i64mem)
// 6 ) ArgSize : Size (in bytes) of vararg type
// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
unsigned DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
MachineOperand &Scale = MI.getOperand(2);
MachineOperand &Index = MI.getOperand(3);
MachineOperand &Disp = MI.getOperand(4);
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
unsigned Align = MI.getOperand(8).getImm();
MachineFunction *MF = MBB->getParent();
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
MachineMemOperand *OldMMO = MI.memoperands().front();
// Clone the MMO into two separate MMOs for loading and storing
MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
DebugLoc DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
// i32 fp_offset
// i64 overflow_area (address)
// i64 reg_save_area (address)
// }
// sizeof(va_list) = 24
// alignment(va_list) = 8
unsigned TotalNumIntRegs = 6;
unsigned TotalNumXMMRegs = 8;
bool UseGPOffset = (ArgMode == 1);
bool UseFPOffset = (ArgMode == 2);
unsigned MaxOffset = TotalNumIntRegs * 8 +
(UseFPOffset ? TotalNumXMMRegs * 16 : 0);
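// In the SysV x86-64 register save area the 6 GPRs occupy the first
// 6*8 = 48 bytes and the 8 XMM registers the next 8*16 = 128 bytes, so
// gp_offset tops out at 48 and fp_offset at 176.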
// Align ArgSize to a multiple of 8.
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
bool NeedsAlign = (Align > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
MachineBasicBlock *offsetMBB;
MachineBasicBlock *endMBB;
unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
unsigned OffsetReg = 0;
if (!UseGPOffset && !UseFPOffset) {
// If we only pull from the overflow region, we don't create a branch.
// We don't need to alter control flow.
OffsetDestReg = 0; // unused
OverflowDestReg = DestReg;
offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
// First emit code to check if gp_offset (or fp_offset) is below the bound.
// If so, pull the argument from reg_save_area. (branch to offsetMBB)
// If not, pull from overflow_area. (branch to overflowMBB)
//
// thisMBB
// | .
// | .
// offsetMBB overflowMBB
// | .
// | .
// endMBB
// Registers for the PHI in endMBB
OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator MBBIter = ++MBB->getIterator();
// Insert the new basic blocks
MF->insert(MBBIter, offsetMBB);
MF->insert(MBBIter, overflowMBB);
MF->insert(MBBIter, endMBB);
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
// Make offsetMBB and overflowMBB successors of thisMBB
thisMBB->addSuccessor(offsetMBB);
thisMBB->addSuccessor(overflowMBB);
// endMBB is a successor of both offsetMBB and overflowMBB
offsetMBB->addSuccessor(endMBB);
overflowMBB->addSuccessor(endMBB);
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
.addReg(OffsetReg)
.addImm(MaxOffset + 8 - ArgSizeA8);
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
.addMBB(overflowMBB).addImm(X86::COND_AE);
}
// In offsetMBB, emit code to use the reg_save_area.
if (offsetMBB) {
assert(OffsetReg != 0);
// Read the reg_save_area address.
unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 16)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
.addImm(0)
.addReg(OffsetReg)
.addImm(X86::sub_32bit);
// Add the offset to the reg_save_area to get the final address.
BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
.addReg(OffsetReg64)
.addReg(RegSaveReg);
// Compute the offset for the next argument
unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
.addReg(OffsetReg)
.addImm(UseFPOffset ? 16 : 8);
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.addReg(NextOffsetReg)
.setMemRefs(StoreOnlyMMO);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
.addMBB(endMBB);
}
//
// Emit code to use overflow area
//
// Load the overflow_area address into a register.
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
.addReg(OverflowAddrReg)
.addImm(Align-1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
.addReg(TmpReg)
.addImm(~(uint64_t)(Align-1));
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
.addReg(OverflowAddrReg);
}
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
.addReg(OverflowDestReg)
.addImm(ArgSizeA8);
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.addReg(NextAddrReg)
.setMemRefs(StoreOnlyMMO);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
BuildMI(*endMBB, endMBB->begin(), DL,
TII->get(X86::PHI), DestReg)
.addReg(OffsetDestReg).addMBB(offsetMBB)
.addReg(OverflowDestReg).addMBB(overflowMBB);
}
// Erase the pseudo instruction
MI.eraseFromParent();
return endMBB;
}
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
// Emit code to save XMM registers to the stack. The ABI says that the
// number of registers to save is given in %al, so it's theoretically
// possible to do an indirect jump trick to avoid saving all of them.
// However, this code takes a simpler approach and just executes all
// of the stores if %al is non-zero. It's less code, it's probably
// easier on the hardware branch predictor, and stores aren't all that
// expensive anyway.
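// An illustrative sketch of the emitted sequence for the non-Win64 case (not
// literal output; 'rsf' stands for the register save frame index):
//   testb %al, %al
//   je    .LEndMBB                        # nothing to save
//   movaps %xmm0, VarArgsFPOffset+0(rsf)
//   movaps %xmm1, VarArgsFPOffset+16(rsf)
//   ...                                   # one aligned 16-byte store per operand
// .LEndMBB: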
// Create the new basic blocks. One block contains all the XMM stores,
// and one block is the final destination regardless of whether any
// stores were performed.
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *F = MBB->getParent();
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(MBBIter, XMMSaveMBB);
F->insert(MBBIter, EndMBB);
// Transfer the remainder of MBB and its successor edges to EndMBB.
EndMBB->splice(EndMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
// The original block will now fall through to the XMM save block.
MBB->addSuccessor(XMMSaveMBB);
// The XMMSaveMBB will fall through to the end block.
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
unsigned CountReg = MI.getOperand(0).getReg();
int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
MBB->addSuccessor(EndMBB);
}
// Make sure the last operand is EFLAGS, which gets clobbered by the branch
// that was just emitted, but clearly shouldn't be "saved".
assert((MI.getNumOperands() <= 3 ||
!MI.getOperand(MI.getNumOperands() - 1).isReg() ||
MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
"Expected last argument to be EFLAGS");
unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
.addReg(/*IndexReg=*/0)
.addImm(/*Disp=*/Offset)
.addReg(/*Segment=*/0)
.addReg(MI.getOperand(i).getReg())
.addMemOperand(MMO);
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return EndMBB;
}
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock* BB,
const TargetRegisterInfo* TRI) {
// Scan forward through BB for a use/def of EFLAGS.
MachineBasicBlock::iterator miI(std::next(SelectItr));
for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
const MachineInstr& mi = *miI;
if (mi.readsRegister(X86::EFLAGS))
return false;
if (mi.definesRegister(X86::EFLAGS))
break; // Should have kill-flag - update below.
}
// If we hit the end of the block, check whether EFLAGS is live into a
// successor.
if (miI == BB->end()) {
for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
sEnd = BB->succ_end();
sItr != sEnd; ++sItr) {
MachineBasicBlock* succ = *sItr;
if (succ->isLiveIn(X86::EFLAGS))
return false;
}
}
// We found a def, or hit the end of the basic block and EFLAGS wasn't live
// out. SelectMI should have a kill flag on EFLAGS.
SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
return true;
}
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic-block with
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::CMOV_FR32:
case X86::CMOV_FR64:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
case X86::CMOV_VK16:
case X86::CMOV_VK32:
case X86::CMOV_VK64:
return true;
default:
return false;
}
}
// Helper function, which inserts PHI functions into SinkMBB:
// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
// the last PHI function inserted.
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
MachineBasicBlock *SinkMBB) {
MachineFunction *MF = TrueMBB->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
DebugLoc DL = MIItBegin->getDebugLoc();
X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
// That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from each earlier PHI's
// destination register to the registers that went into that PHI.
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
MachineInstrBuilder MIB;
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
unsigned DestReg = MIIt->getOperand(0).getReg();
unsigned Op1Reg = MIIt->getOperand(1).getReg();
unsigned Op2Reg = MIIt->getOperand(2).getReg();
// If this CMOV we are generating is the opposite condition from
// the jump we generated, then we have to swap the operands for the
// PHI that is going to be generated.
if (MIIt->getOperand(3).getImm() == OppCC)
std::swap(Op1Reg, Op2Reg);
if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
Op1Reg = RegRewriteTable[Op1Reg].first;
if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
Op2Reg = RegRewriteTable[Op2Reg].second;
MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
.addMBB(FalseMBB)
.addReg(Op2Reg)
.addMBB(TrueMBB);
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
}
return MIB;
}
// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
MachineBasicBlock *
X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
MachineInstr &SecondCascadedCMOV,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = FirstCMOV.getDebugLoc();
// We lower cascaded CMOVs such as
//
// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
//
// to two successive branches.
//
// Without this, we would add a PHI between the two jumps, which ends up
// creating a few copies all around. For instance, for
//
// (sitofp (zext (fcmp une)))
//
// we would generate:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// movaps %xmm0, %xmm1
// jne .LBB5_2
// xorps %xmm1, %xmm1
// .LBB5_2:
// jp .LBB5_4
// movaps %xmm1, %xmm0
// .LBB5_4:
// retq
//
// because this custom-inserter would have generated:
//
// A
// | \
// | B
// | /
// C
// | \
// | D
// | /
// E
//
// A: X = ...; Y = ...
// B: empty
// C: Z = PHI [X, A], [Y, B]
// D: empty
// E: PHI [X, C], [Z, D]
//
// If we lower both CMOVs in a single step, we can instead generate:
//
// A
// | \
// | C
// | /|
// |/ |
// | |
// | D
// | /
// E
//
// A: X = ...; Y = ...
// D: empty
// E: PHI [X, A], [X, C], [Y, D]
//
// Which, in our sitofp/fcmp example, gives us something like:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// jne .LBB5_4
// jp .LBB5_4
// xorps %xmm0, %xmm0
// .LBB5_4:
// retq
//
// We lower cascaded CMOV into two successive branches to the same block.
// EFLAGS is used by both, so mark it as live in the second.
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FirstInsertedMBB);
F->insert(It, SecondInsertedMBB);
F->insert(It, SinkMBB);
// For a cascaded CMOV, we lower it to two successive branches to
// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
// the FirstInsertedMBB.
FirstInsertedMBB->addLiveIn(X86::EFLAGS);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
SecondInsertedMBB->addLiveIn(X86::EFLAGS);
SinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->begin(), ThisMBB,
std::next(MachineBasicBlock::iterator(FirstCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Fallthrough block for ThisMBB.
ThisMBB->addSuccessor(FirstInsertedMBB);
// The true block target of the first branch is always SinkMBB.
ThisMBB->addSuccessor(SinkMBB);
// Fallthrough block for FirstInsertedMBB.
FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
// The true block for the branch of FirstInsertedMBB.
FirstInsertedMBB->addSuccessor(SinkMBB);
// This is fallthrough.
SecondInsertedMBB->addSuccessor(SinkMBB);
// Create the conditional branch instructions.
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
X86::CondCode SecondCC =
X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
unsigned DestReg = FirstCMOV.getOperand(0).getReg();
unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
MachineInstrBuilder MIB =
BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
.addMBB(SecondInsertedMBB)
.addReg(Op2Reg)
.addMBB(ThisMBB);
// The edge from FirstInsertedMBB provides the same incoming value as the
// edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
TII->get(TargetOpcode::COPY),
SecondCascadedCMOV.getOperand(0).getReg())
.addReg(FirstCMOV.getOperand(0).getReg());
// Now remove the CMOVs.
FirstCMOV.eraseFromParent();
SecondCascadedCMOV.eraseFromParent();
return SinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between and a branch opcode to use.
// ThisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> FalseMBB
// This code lowers all pseudo-CMOV instructions. Generally it lowers these
// as described above, by inserting a BB, and then making a PHI at the join
// point to select the true and false operands of the CMOV in the PHI.
//
// The code also handles two different cases of multiple CMOV opcodes
// in a row.
//
// Case 1:
// In this case, there are multiple CMOVs in a row, all of which are based on
// the same condition setting (or the exact opposite condition setting).
// In this case we can lower all the CMOVs using a single inserted BB, and
// then make a number of PHIs at the join point to model the CMOVs. The only
// trickiness here is that in a case like:
//
// t2 = CMOV cond1 t1, f1
// t3 = CMOV cond1 t2, f2
//
// when rewriting this into PHIs, we have to perform some renaming on the
// temps since you cannot have a PHI operand refer to a PHI result earlier
// in the same block. The "simple" but wrong lowering would be:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t2(BB1), f2(BB2)
//
// but clearly t2 is not defined in BB1, so that is incorrect. The proper
// renaming is to note that on the path through BB1, t2 is really just a
// copy of t1, and do that renaming, properly generating:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t1(BB1), f2(BB2)
//
// Case 2:
// CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
// function - EmitLoweredCascadedSelect.
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineInstr *LastCMOV = &MI;
MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition. Skip over
// intervening debug insts.
while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
++NextMIIt;
NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
}
}
// Check for case 2, but only if we didn't already find case 1 (which is
// indicated by LastCMOV still pointing at MI).
if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
NextMIIt->getOpcode() == MI.getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
}
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FalseMBB);
F->insert(It, SinkMBB);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!LastCMOV->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
FalseMBB->addLiveIn(X86::EFLAGS);
SinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer any debug instructions inside the CMOV sequence to the sunk block.
auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
auto DbgIt = MachineBasicBlock::iterator(MI);
while (DbgIt != DbgEnd) {
auto Next = std::next(DbgIt);
if (DbgIt->isDebugInstr())
SinkMBB->push_back(DbgIt->removeFromParent());
DbgIt = Next;
}
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->end(), ThisMBB,
std::next(MachineBasicBlock::iterator(LastCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Fallthrough block for ThisMBB.
ThisMBB->addSuccessor(FalseMBB);
// The true block target of the first (or only) branch is always a SinkMBB.
ThisMBB->addSuccessor(SinkMBB);
// Fallthrough block for FalseMBB.
FalseMBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// SinkMBB:
// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
// ...
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
std::next(MachineBasicBlock::iterator(LastCMOV));
createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
// Now remove the CMOV(s).
ThisMBB->erase(MIItBegin, MIItEnd);
return SinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
const bool Is64Bit = Subtarget.is64Bit();
const bool IsLP64 = Subtarget.isTarget64BitLP64();
const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
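// Note: these are the TLS slots in which the split-stack runtime keeps the
// current stacklet limit (%fs:0x70 for LP64, %fs:0x40 for x32, %gs:0x30 for
// 32-bit); the values follow the libgcc split-stack convention.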
// BB:
// ... [Till the alloca]
// If stacklet is not large enough, jump to mallocMBB
//
// bumpMBB:
// Allocate by subtracting from RSP
// Jump to continueMBB
//
// mallocMBB:
// Allocate by call to runtime
//
// continueMBB:
// ...
// [rest of original BB]
//
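// An illustrative pseudo-C sketch of the check emitted below (names are
// descriptive only):
//   tmpSP   = SP;
//   SPLimit = tmpSP - size;             // candidate new stack pointer
//   if (*(TlsReg:TlsOffset) > SPLimit)  // stacklet limit above the new SP?
//     goto mallocMBB;                   //   not enough room: call the runtime
//   else
//     goto bumpMBB;                     //   enough room: just move SP down
//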
MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
getRegClassFor(getPointerTy(MF->getDataLayout()));
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI.getOperand(1).getReg(),
physSPReg =
IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = ++BB->getIterator();
MF->insert(MBBIter, bumpMBB);
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
continueMBB->splice(continueMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
continueMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// mallocMBB calls into a libgcc routine to allocate more space from the heap.
const uint32_t *RegMask =
Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else if (Is64Bit) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EDI, RegState::Implicit)
.addReg(X86::EAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
.addImm(12);
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EAX, RegState::ImplicitDefine);
}
if (!Is64Bit)
BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
.addImm(16);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
BB->addSuccessor(bumpMBB);
BB->addSuccessor(mallocMBB);
mallocMBB->addSuccessor(continueMBB);
bumpMBB->addSuccessor(continueMBB);
// Take care of the PHI nodes.
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
MI.getOperand(0).getReg())
.addReg(mallocPtrVReg)
.addMBB(mallocMBB)
.addReg(bumpSPPtrVReg)
.addMBB(bumpMBB);
// Delete the original pseudo instruction.
MI.eraseFromParent();
// And we're done.
return continueMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
DebugLoc DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
// Only 32-bit EH needs to worry about manually restoring stack pointers.
if (!Subtarget.is32Bit())
return BB;
// C++ EH creates a new target block to hold the restore code, and wires up
// the new block to the return destination with a normal JMP_4.
MachineBasicBlock *RestoreMBB =
MF->CreateMachineBasicBlock(BB->getBasicBlock());
assert(BB->succ_size() == 1);
MF->insert(std::next(BB->getIterator()), RestoreMBB);
RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(RestoreMBB);
MI.getOperand(0).setMBB(RestoreMBB);
auto RestoreMBBI = RestoreMBB->begin();
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const Constant *PerFn = MF->getFunction().getPersonalityFn();
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
// Only 32-bit SEH requires special handling for catchpad.
if (IsSEH && Subtarget.is32Bit()) {
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
}
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Here we replace TLSADDR with the sequence:
// adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into a call
// inside MC; without the two markers, shrink-wrapping
// may push the prologue/epilogue past them.
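// For example, for a 64-bit target the result is roughly the pseudo-MI
// sequence (illustrative, not literal output):
//   ADJCALLSTACKDOWN64 0, 0, 0
//   TLS_addr64 ...                // becomes the __tls_get_addr call in MC
//   ADJCALLSTACKUP64 0, 0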
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
MachineFunction &MF = *BB->getParent();
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
// We don't call erase from parent because we want to keep the
// original instruction around.
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
MachineInstrBuilder CallseqEnd =
BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
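// On x86-64 this corresponds roughly to the usual Darwin TLS sequence
// (illustrative assembly, not emitted verbatim here):
//   movq  _var@TLVP(%rip), %rdi   # load the TLV descriptor address
//   callq *(%rdi)                 # call its accessor; result ends up in %rax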
MachineFunction *F = BB->getParent();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
assert(MI.getOperand(3).isGlobal() && "This should be a global");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
Subtarget.is64Bit() ?
Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
if (Subtarget.is64Bit()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else if (!isPositionIndependent()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addReg(0)
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addReg(TII->getGlobalBaseReg(F))
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
switch (RPOpc) {
case X86::RETPOLINE_CALL32:
return X86::CALLpcrel32;
case X86::RETPOLINE_CALL64:
return X86::CALL64pcrel32;
case X86::RETPOLINE_TCRETURN32:
return X86::TCRETURNdi;
case X86::RETPOLINE_TCRETURN64:
return X86::TCRETURNdi64;
}
llvm_unreachable("not retpoline opcode");
}
static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
unsigned Reg) {
if (Subtarget.useRetpolineExternalThunk()) {
// When using an external thunk for retpolines, we pick names that match the
// names GCC happens to use as well. This helps simplify the implementation
// of the thunks for kernels where they have no easy ability to create
// aliases and are doing non-trivial configuration of the thunk's body. For
// example, the Linux kernel will do boot-time hot patching of the thunk
// bodies and cannot easily export aliases of these to loaded modules.
//
// Note that at any point in the future, we may need to change the semantics
// of how we implement retpolines and at that time will likely change the
// name of the called thunk. Essentially, there is no hard guarantee that
// LLVM will generate calls to specific thunks; we merely make a best-effort
// attempt to help out kernels and other systems where duplicating the
// thunks is costly.
switch (Reg) {
case X86::EAX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_eax";
case X86::ECX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_ecx";
case X86::EDX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_edi";
case X86::R11:
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__x86_indirect_thunk_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}
// When targeting an internal COMDAT thunk use an LLVM-specific name.
switch (Reg) {
case X86::EAX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_eax";
case X86::ECX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_ecx";
case X86::EDX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edi";
case X86::R11:
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__llvm_retpoline_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Copy the virtual register into the R11 physical register and
// call the retpoline thunk.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
unsigned CalleeVReg = MI.getOperand(0).getReg();
unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
// Find an available scratch register to hold the callee. On 64-bit, we can
// just use R11, but we scan for uses anyway to ensure we don't generate
// incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
// already a register use operand to the call to hold the callee. If none
// are available, use EDI instead. EDI is chosen because EBX is the PIC base
// register and ESI is the base pointer to realigned stack frames with VLAs.
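// The net effect is roughly (illustrative, 64-bit case):
//   %r11 = COPY %callee
//   CALL64pcrel32 @__llvm_retpoline_r11, implicit killed %r11
// i.e. the indirect call becomes a direct call to a thunk that performs the
// branch through the chosen scratch register.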
SmallVector<unsigned, 3> AvailableRegs;
if (Subtarget.is64Bit())
AvailableRegs.push_back(X86::R11);
else
AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
// Zero out any registers that are already used.
for (const auto &MO : MI.operands()) {
if (MO.isReg() && MO.isUse())
for (unsigned &Reg : AvailableRegs)
if (Reg == MO.getReg())
Reg = 0;
}
// Choose the first remaining non-zero available register.
unsigned AvailableReg = 0;
for (unsigned MaybeReg : AvailableRegs) {
if (MaybeReg) {
AvailableReg = MaybeReg;
break;
}
}
if (!AvailableReg)
report_fatal_error("calling convention incompatible with retpoline, no "
"available registers");
const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
.addReg(CalleeVReg);
MI.getOperand(0).ChangeToES(Symbol);
MI.setDesc(TII->get(Opc));
MachineInstrBuilder(*BB->getParent(), &MI)
.addReg(AvailableReg, RegState::Implicit | RegState::Kill);
return BB;
}
/// A SetJmp call implies a future control-flow change when the corresponding
/// LongJmp is called.
/// Instead of using the 'return' instruction, the long jump fixes the stack and
/// performs an indirect branch. To do so it uses the registers that were stored
/// in the jump buffer (when calling SetJmp).
/// If the shadow stack is enabled, we need to fix it as well, because the
/// skipped return addresses would otherwise leave it out of sync.
/// The function will save the SSP for future fixing in the function
/// emitLongJmpShadowStackFix.
/// \sa emitLongJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
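/// An illustrative sketch of what this helper emits, assuming a 64-bit target
/// (not literal output):
///   xorq   %scratch, %scratch   ; stays zero if CET is not supported
///   rdsspq %scratch             ; read the shadow stack pointer
///   movq   %scratch, 24(buf)    ; save it in slot 3 of the jump buffer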
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineInstrBuilder MIB;
// Memory Reference.
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
// Initialize a register with zero.
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
unsigned ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
.addDef(ZReg)
.addReg(ZReg, RegState::Undef)
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Write the SSP register value to slot 3 of the input memory buffer.
unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
const int64_t SSPOffset = 3 * PVT.getStoreSize();
const unsigned MemOpndSlot = 1;
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
else
MIB.add(MI.getOperand(MemOpndSlot + i));
}
MIB.addReg(SSPCopyReg);
MIB.setMemRefs(MMOs);
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
unsigned DstReg;
unsigned MemOpndSlot = 0;
unsigned CurOp = 0;
DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
(void)TRI;
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned restoreDstReg = MRI.createVirtualRegister(RC);
MemOpndSlot = CurOp;
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
// buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
// SjLjSetup restoreMBB
//
// mainMBB:
// v_main = 0
//
// sinkMBB:
// v = phi(main, restore)
//
// restoreMBB:
// if base pointer being used, load it from frame
// v_restore = 1
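//
// For reference, the pointer-sized jump-buffer slots used by this lowering and
// by emitEHSjLjLongJmp below are:
//   buf[0] = frame pointer, buf[1] = resume IP (LabelOffset),
//   buf[2] = stack pointer (SPOffset), buf[3] = shadow stack pointer.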
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MF->push_back(restoreMBB);
restoreMBB->setHasAddressTaken();
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
// Prepare IP either in reg or imm.
if (!UseImmLabel) {
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
LabelReg = MRI.createVirtualRegister(PtrRC);
if (Subtarget.is64Bit()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addMBB(restoreMBB)
.addReg(0);
} else {
const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
.addReg(XII->getGlobalBaseReg(MF))
.addImm(0)
.addReg(0)
.addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
} else
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
else
MIB.addMBB(restoreMBB);
MIB.setMemRefs(MMOs);
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
emitSetJmpShadowStackFix(MI, thisMBB);
}
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
// mainMBB:
// EAX = 0
BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
const bool Uses64BitFramePtr =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
unsigned BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
MI.eraseFromParent();
return sinkMBB;
}
/// Fix the shadow stack using the previously saved SSP pointer.
/// \sa emitSetJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
/// \return The sink MBB that will perform the future indirect branch.
MachineBasicBlock *
X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
// checkSspMBB:
// xor vreg1, vreg1
// rdssp vreg1
// test vreg1, vreg1
// je sinkMBB # Jump if Shadow Stack is not supported
// fallMBB:
// mov buf+24/12(%rip), vreg2
// sub vreg1, vreg2
// jbe sinkMBB # No need to fix the Shadow Stack
// fixShadowMBB:
// shr 3/2, vreg2
// incssp vreg2 # fix the SSP according to the lower 8 bits
// shr 8, vreg2
// je sinkMBB
// fixShadowLoopPrepareMBB:
// shl vreg2
// mov 128, vreg3
// fixShadowLoopMBB:
// incssp vreg3
// dec vreg2
// jne fixShadowLoopMBB # Iterate until you finish fixing
// # the Shadow Stack
// sinkMBB:
MachineFunction::iterator I = ++MBB->getIterator();
const BasicBlock *BB = MBB->getBasicBlock();
MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, checkSspMBB);
MF->insert(I, fallMBB);
MF->insert(I, fixShadowMBB);
MF->insert(I, fixShadowLoopPrepareMBB);
MF->insert(I, fixShadowLoopMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
unsigned ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
.addDef(ZReg)
.addReg(ZReg, RegState::Undef)
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Check whether the SSP register value is zero; if it is, the shadow stack
// is not supported, so jump directly to the sink.
unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
.addReg(SSPCopyReg)
.addReg(SSPCopyReg);
BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
checkSspMBB->addSuccessor(sinkMBB);
checkSspMBB->addSuccessor(fallMBB);
// Reload the previously saved SSP register value.
unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
const int64_t SPPOffset = 3 * PVT.getStoreSize();
MachineInstrBuilder MIB =
BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, SPPOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Subtract the current SSP from the previous SSP.
unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
.addReg(PrevSSPReg)
.addReg(SSPCopyReg);
// Jump to sink in case PrevSSPReg <= SSPCopyReg.
BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
fallMBB->addSuccessor(sinkMBB);
fallMBB->addSuccessor(fixShadowMBB);
// Shift right by 2 (32-bit) or 3 (64-bit) because incssp multiplies its argument by 4 or 8.
unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
.addReg(SspSubReg)
.addImm(Offset);
// Advance the SSP using only the lower 8 bits of the delta.
unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
// Reset the lower 8 bits.
unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
.addReg(SspFirstShrReg)
.addImm(8);
// Jump if the result of the shift is zero.
BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
fixShadowMBB->addSuccessor(sinkMBB);
fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
// Do a single shift left.
unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
.addReg(SspSecondShrReg);
// Save the value 128 to a register (will be used next with incssp).
unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
.addImm(128);
fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
// Since incssp only looks at the lower 8 bits, we might need to do several
// iterations of incssp until we finish fixing the shadow stack.
unsigned DecReg = MRI.createVirtualRegister(PtrRC);
unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
.addReg(SspAfterShlReg)
.addMBB(fixShadowLoopPrepareMBB)
.addReg(DecReg)
.addMBB(fixShadowLoopMBB);
// Every iteration we increase the SSP by 128.
BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
// Every iteration we decrement the counter by 1.
unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
// Jump if the counter is not zero yet.
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
fixShadowLoopMBB->addSuccessor(sinkMBB);
fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as a GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t SPOffset = 2 * PVT.getStoreSize();
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
MachineBasicBlock *thisMBB = MBB;
// When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
}
// Reload FP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Reload IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, LabelOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Reload SP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
else
MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
// the last instruction of the expansion.
}
MIB.setMemRefs(MMOs);
// Jump
BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
MI.eraseFromParent();
return thisMBB;
}
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
unsigned Op = 0;
unsigned VR = 0;
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
if (UseImmLabel) {
Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
} else {
const TargetRegisterClass *TRC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addMBB(DispatchBB)
.addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
.addReg(0) /* TII->getGlobalBaseReg(MF) */
.addImm(1)
.addReg(0)
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
if (UseImmLabel)
MIB.addMBB(DispatchBB);
else
MIB.addReg(VR);
}
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MF->getFrameInfo().getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (auto &MBB : *MF) {
if (!MBB.isEHPad())
continue;
MCSymbol *Sym = nullptr;
for (const auto &MI : MBB) {
if (MI.isDebugInstr())
continue;
assert(MI.isEHLabel() && "expected EH_LABEL");
Sym = MI.getOperand(0).getMCSymbol();
break;
}
if (!MF->hasCallSiteLandingPad(Sym))
continue;
for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
CallSiteNumToLPad[CSI].push_back(&MBB);
MaxCSNum = std::max(MaxCSNum, CSI);
}
}
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock *> LPadList;
SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
LPadList.reserve(CallSiteNumToLPad.size());
for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
for (auto &LP : CallSiteNumToLPad[CSI]) {
LPadList.push_back(LP);
InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
}
}
assert(!LPadList.empty() &&
"No landing pad destinations for the dispatch jump table!");
// Create the MBBs for the dispatch code.
// Shove the dispatch's address into the return slot in the function context.
MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
DispatchBB->setIsEHPad(true);
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
BuildMI(TrapBB, DL, TII->get(X86::TRAP));
DispatchBB->addSuccessor(TrapBB);
MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
DispatchBB->addSuccessor(DispContBB);
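// At a high level, the dispatch emitted below behaves like (illustrative
// pseudo C):
//   idx = call-site value loaded from the function context;
//   if (idx >= LPadList.size()) trap();     // DispatchBB -> TrapBB
//   goto *jump_table[idx];                  // DispContBB -> landing pad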
// Insert MBBs.
MF->push_back(DispatchBB);
MF->push_back(DispContBB);
MF->push_back(TrapBB);
// Insert code into the entry block that creates and registers the function
// context.
SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
// Create the jump table and associated information
unsigned JTE = getJumpTableEncoding();
MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
const bool FPIs64Bit =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setRestoreBasePointer(MF);
unsigned FP = RI.getFrameRegister(*MF);
unsigned BP = RI.getBaseRegister();
unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
MFI->getRestoreBasePointerOffset())
.addRegMask(RI.getNoPreservedMask());
} else {
BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
.addRegMask(RI.getNoPreservedMask());
}
// IReg is used as an index in a memory operand and therefore can't be SP
unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
Subtarget.is64Bit() ? 8 : 4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
if (Subtarget.is64Bit()) {
unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
// leaq .LJTI0_0(%rip), BReg
BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addJumpTableIndex(MJTI)
.addReg(0);
// movzx IReg64, IReg
BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
.addImm(0)
.addReg(IReg)
.addImm(X86::sub_32bit);
switch (JTE) {
case MachineJumpTableInfo::EK_BlockAddress:
// jmpq *(BReg,IReg64,8)
BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
.addReg(BReg)
.addImm(8)
.addReg(IReg64)
.addImm(0)
.addReg(0);
break;
case MachineJumpTableInfo::EK_LabelDifference32: {
unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
// movl (BReg,IReg64,4), OReg
BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
.addReg(BReg)
.addImm(4)
.addReg(IReg64)
.addImm(0)
.addReg(0);
// movsx OReg64, OReg
BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
// addq BReg, OReg64, TReg
BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
.addReg(OReg64)
.addReg(BReg);
// jmpq *TReg
BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
break;
}
default:
llvm_unreachable("Unexpected jump table encoding");
}
} else {
// jmpl *.LJTI0_0(,IReg,4)
BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
.addReg(0)
.addImm(4)
.addReg(IReg)
.addJumpTableIndex(MJTI)
.addReg(0);
}
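// The two encodings differ in what the table stores: EK_BlockAddress entries
// are absolute pointers (hence the single indexed jump in the 64-bit path
// above), while EK_LabelDifference32 entries are 32-bit offsets relative to
// the table base, so an entry is loaded, sign-extended, and added back to the
// base before jumping.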
// Add the jump table entries as successors to the MBB.
SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
for (auto &LP : LPadList)
if (SeenMBBs.insert(LP).second)
DispContBB->addSuccessor(LP);
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
// Keep a copy of Successors since it's modified inside the loop.
SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
MBB->succ_rend());
// FIXME: Avoid quadratic complexity.
for (auto MBBS : Successors) {
if (MBBS->isEHPad()) {
MBB->removeSuccessor(MBBS);
MBBLPads.push_back(MBBS);
}
}
MBB->addSuccessor(DispatchBB);
// Find the invoke call and mark all of the callee-saved registers as
// 'implicit defined' so that they're spilled. This prevents later passes
// from moving instructions to before the EH block, where they would never
// be executed.
for (auto &II : reverse(*MBB)) {
if (!II.isCall())
continue;
DenseMap<unsigned, bool> DefRegs;
for (auto &MOp : II.operands())
if (MOp.isReg())
DefRegs[MOp.getReg()] = true;
MachineInstrBuilder MIB(*MF, &II);
for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
unsigned Reg = SavedRegs[RI];
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}
break;
}
}
// Mark all former landing pads as non-landing pads. The dispatch is the only
// landing pad now.
for (auto &LP : MBBLPads)
LP->setIsEHPad(false);
// The instruction is gone now.
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
case X86::RETPOLINE_CALL32:
case X86::RETPOLINE_CALL64:
case X86::RETPOLINE_TCRETURN32:
case X86::RETPOLINE_TCRETURN64:
return EmitLoweredRetpoline(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::CATCHPAD:
return EmitLoweredCatchPad(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
case X86::CMOV_VK16:
case X86::CMOV_VK32:
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
unsigned PushF =
MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
// Permit reads of the EFLAGS and DF registers without them being defined.
// This intrinsic exists to read external processor state in flags, such as
// the trap flag, interrupt flag, and direction flag, none of which are
// modeled by the backend.
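// The lowering is effectively 'pushf; pop %dst' (PUSHF32/POP32r or
// PUSHF64/POP64r); EFLAGS and DF are marked undef on the push so that reading
// them here does not require them to have been defined earlier.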
assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
"Unexpected register in operand!");
Push->getOperand(2).setIsUndef();
assert(Push->getOperand(3).getReg() == X86::DF &&
"Unexpected register in operand!");
Push->getOperand(3).setIsUndef();
BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
unsigned Push =
MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
BuildMI(*BB, MI, DL, TII->get(PopF));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
case X86::FP64_TO_INT16_IN_MEM:
case X86::FP64_TO_INT32_IN_MEM:
case X86::FP64_TO_INT64_IN_MEM:
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
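// Rough shape of the sequence emitted below (operand names are placeholders):
//   fnstcw OrigCW         ; save the current FP control word
//   movzwl OrigCW, %tmp
//   orl    $0xC00, %tmp   ; RC field (bits 10-11) = 0b11 -> round toward zero
//   movw   %tmp16, NewCW
//   fldcw  NewCW          ; switch rounding mode
//   fist*  <dst>          ; truncating store via IST_Fp*m*
//   fldcw  OrigCW         ; restore the original control word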
int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
// Load the old value of the control word...
unsigned OldCW =
MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
OrigCWFrameIdx);
// OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
unsigned NewCW =
MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill).addImm(0xC00);
// Extract to 16 bits.
unsigned NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
}
X86AddressMode AM = getAddressFromInstr(&MI, 0);
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
.addReg(MI.getOperand(X86::AddrNumOperands).getReg());
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
return EmitVAARG64WithCustomInserter(MI, BB);
case X86::EH_SjLj_SetJmp32:
case X86::EH_SjLj_SetJmp64:
return emitEHSjLjSetJmp(MI, BB);
case X86::EH_SjLj_LongJmp32:
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
case X86::Int_eh_sjlj_setup_dispatch:
return EmitSjLjDispatchBlock(MI, BB);
case TargetOpcode::STATEPOINT:
// As an implementation detail, STATEPOINT shares the STACKMAP format at
// this point in the process. We diverge later.
return emitPatchPoint(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
return emitXRayCustomEvent(MI, BB);
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
return emitXRayTypedEvent(MI, BB);
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to the four E[ABCD] registers implied by the encoding,
// CMPXCHG8B requires a memory operand. If the current target is i686 and the
// current function needs a base pointer - which is ESI on i686 - the register
// allocator would not be able to allocate registers for an address of the
// form X(%reg, %reg, Y): there would never be enough unreserved registers
// during regalloc (without the base pointer the only option would be
// X(%edi, %esi, Y)). We give the register allocator a hand by precomputing
// the address in a new vreg using LEA.
// If the target is not i686 or there is no base pointer, there is nothing to
// do here.
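// Illustrative transformation (register names are placeholders):
//   cmpxchg8b X(%esi,%ecx,4)   ->   leal X(%esi,%ecx,4), %vreg
//                                   cmpxchg8b (%vreg)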
if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
return BB;
// Even though this code does not necessarily need the base pointer to be
// ESI, we check for that. The reason: if this assert fails, something has
// changed in the compiler's base pointer handling, and that change most
// probably has to be addressed here as well.
assert(TRI->getBaseRegister() == X86::ESI &&
"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind");
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT SPTy = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
// does not use an index register.
if (AM.IndexReg == X86::NoRegister)
return BB;
// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
// four operand definitions that are E[ABCD] registers. We skip them and
// then insert the LEA.
MachineBasicBlock::iterator MBBI(MI);
while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
--MBBI;
addFullAddress(
BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
setDirectAddressInInstr(&MI, 0, computedAddrVReg);
return BB;
}
case X86::LCMPXCHG16B:
return BB;
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
unsigned BasePtr =
MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
if (!BB->isLiveIn(BasePtr))
BB->addLiveIn(BasePtr);
return BB;
}
}
}
//===----------------------------------------------------------------------===//
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
bool
X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
const APInt &Demanded,
TargetLoweringOpt &TLO) const {
// Only optimize Ands to prevent shrinking a constant that could be
// matched by movzx.
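// Worked example: for (and X, 0x3FF) where only the low 8 bits are demanded,
// the shrunk mask 0xFF already covers a whole byte, so the constant is
// replaced with 0xFF and the AND can be selected as a movzx.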
if (Op.getOpcode() != ISD::AND)
return false;
EVT VT = Op.getValueType();
// Ignore vectors.
if (VT.isVector())
return false;
unsigned Size = VT.getSizeInBits();
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
return false;
const APInt &Mask = C->getAPIntValue();
// Clear all non-demanded bits initially.
APInt ShrunkMask = Mask & Demanded;
// Find the width of the shrunk mask.
unsigned Width = ShrunkMask.getActiveBits();
// If the mask is all 0s there's nothing to do here.
if (Width == 0)
return false;
// Find the next power of 2 width, rounding up to a byte.
Width = PowerOf2Ceil(std::max(Width, 8U));
// Truncate the width to size to handle illegal types.
Width = std::min(Width, Size);
// Calculate a possible zero extend mask for this constant.
APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
// If we aren't changing the mask, just return true to keep it and prevent
// the caller from optimizing.
if (ZeroExtendMask == Mask)
return true;
// Make sure the new mask can be represented by a combination of mask bits
// and non-demanded bits.
if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
return false;
// Replace the constant with the zero extend mask.
SDLoc DL(Op);
SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
}
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) &&
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
Known.resetAll();
switch (Opc) {
default: break;
case X86ISD::SETCC:
Known.Zero.setBitsFrom(1);
break;
case X86ISD::MOVMSK: {
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
Known.Zero.setBitsFrom(NumLoBits);
break;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
Known = Known.zextOrTrunc(BitWidth, false);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
case X86ISD::VSRAI:
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
Known.setAllZero();
break;
}
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
unsigned ShAmt = ShiftImm->getZExtValue();
if (Opc == X86ISD::VSHLI) {
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt);
} else if (Opc == X86ISD::VSRLI) {
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits are known zero.
Known.Zero.setHighBits(ShAmt);
} else {
Known.Zero.ashrInPlace(ShAmt);
Known.One.ashrInPlace(ShAmt);
}
}
break;
}
case X86ISD::PACKUS: {
// PACKUS is just a truncation if the upper half is zero.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
Known.One = APInt::getAllOnesValue(BitWidth * 2);
Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
KnownBits Known2;
if (!!DemandedLHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (!!DemandedRHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (Known.countMinLeadingZeros() < BitWidth)
Known.resetAll();
Known = Known.trunc(BitWidth);
break;
}
case X86ISD::ANDNP: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// ANDNP = (~X & Y);
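// A result bit is known one only where Y is known one and X is known zero,
// and known zero wherever X is known one or Y is known zero.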
Known.One &= Known2.Zero;
Known.Zero |= Known2.One;
break;
}
case X86ISD::FOR: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// Output known-0 bits are only known if clear in both the LHS & RHS.
Known.Zero &= Known2.Zero;
// Output known-1 are known to be set if set in either the LHS | RHS.
Known.One |= Known2.One;
break;
}
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
}
}
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opc)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
Known.resetAll();
break;
} else if (M == SM_SentinelZero) {
Known.One.clearAllBits();
continue;
}
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
Known.resetAll();
break;
}
DemandedOps[OpIdx].setBit(EltIdx);
}
// Known bits are the values that are shared by every demanded element.
for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
if (!DemandedOps[i])
continue;
KnownBits Known2 =
DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
}
}
}
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned VTBits = VT.getScalarSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::SETCC_CARRY:
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
return VTBits;
case X86ISD::VTRUNC: {
// TODO: Add DemandedElts support.
SDValue Src = Op.getOperand(0);
unsigned NumSrcBits = Src.getScalarValueSizeInBits();
assert(VTBits < NumSrcBits && "Illegal truncation input type");
unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
if (Tmp > (NumSrcBits - VTBits))
return Tmp - (NumSrcBits - VTBits);
return 1;
}
case X86ISD::PACKSS: {
// PACKSS is just a truncation if the sign bits extend to the packed size.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
DemandedRHS);
unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
if (!!DemandedLHS)
Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
if (!!DemandedRHS)
Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
unsigned Tmp = std::min(Tmp0, Tmp1);
if (Tmp > (SrcBits - VTBits))
return Tmp - (SrcBits - VTBits);
return 1;
}
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
if (ShiftVal.uge(Tmp))
return 1; // Shifted all sign bits out --> unknown.
return Tmp - ShiftVal.getZExtValue();
}
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits - 1))
return VTBits; // Sign splat.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
ShiftVal += Tmp;
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
case X86ISD::PCMPGT:
case X86ISD::PCMPEQ:
case X86ISD::CMPP:
case X86ISD::VPCOM:
case X86ISD::VPCOMU:
// Vector compares return zero/all-bits result values.
return VTBits;
case X86ISD::ANDNP: {
unsigned Tmp0 =
DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Tmp0 == 1) return 1; // Early out.
unsigned Tmp1 =
DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
return std::min(Tmp0, Tmp1);
}
case X86ISD::CMOV: {
unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp0 == 1) return 1; // Early out.
unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
return std::min(Tmp0, Tmp1);
}
}
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opcode)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
return 1;
} else if (M == SM_SentinelZero) {
// Zero = all sign bits.
continue;
}
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
return 1;
}
DemandedOps[OpIdx].setBit(EltIdx);
}
unsigned Tmp0 = VTBits;
for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
if (!DemandedOps[i])
continue;
unsigned Tmp1 =
DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
Tmp0 = std::min(Tmp0, Tmp1);
}
return Tmp0;
}
}
}
// Fallback case.
return 1;
}
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
return N->getOperand(0);
return N;
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
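// For example, a repeated v4f32 mask {0, 0, 2, 2} matches X86ISD::MOVSLDUP and
// {1, 1, 3, 3} matches X86ISD::MOVSHDUP (see the SSE3 checks below).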
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget, unsigned &Shuffle,
MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
// Match against a VZEXT_MOVL vXi32 zero-extending instruction.
if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
// Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool MatchAny = true;
bool MatchZero = true;
unsigned NumDstElts = NumMaskElts / Scale;
for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
MatchAny = MatchZero = false;
break;
}
MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (MatchAny || MatchZero) {
assert(MatchZero && "Failed to match zext but matched aext?");
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
MVT::getIntegerVT(MaskEltSize);
SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
if (SrcVT.getVectorNumElements() != NumDstElts)
Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
return true;
}
}
}
// Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
isUndefOrEqual(Mask[0], 0) &&
isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
// Check if we have SSE3, which lets us use MOVDDUP etc. These instructions
// are no slower than UNPCKLPD but have the option to fold the input operand
// into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
}
if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v4f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
}
if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v8f64;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
}
return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
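// Worked example: a v4i32 mask {3, 2, 1, 0} matches PSHUFD with immediate
// 0x1B; each 2-bit field of the immediate holds the source element index for
// the corresponding destination element (3 | 2<<2 | 1<<4 | 0<<6 == 0x1B).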
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
bool ContainsZeros =
llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
// Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
} else if (AllowFloatDomain && Subtarget.hasAVX()) {
// VPERMILPD can permute with a non-repeating shuffle.
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
}
// Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
// AVX introduced the VPERMILPD/VPERMILPS float permutes; before that we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
!ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
// Narrow the repeated mask to create 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64)
scaleShuffleMask<int>(2, RepeatedMask, WordMask);
Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
}
}
// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(Mask.data() + 0, 4);
ArrayRef<int> HiMask(Mask.data() + 4, 4);
// PSHUFLW: permute lower 4 elements only.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
Shuffle = X86ISD::PSHUFLW;
ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(LoMask);
return true;
}
// PSHUFHW: permute upper 4 elements only.
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
// Offset the HiMask so that we can create the shuffle immediate.
int OffsetHiMask[4];
for (int i = 0; i != 4; ++i)
OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
Shuffle = X86ISD::PSHUFHW;
ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
}
}
// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
Mask, 0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
return false;
}
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
SDValue &V1, SDValue &V2, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
SrcVT = DstVT = MVT::v4f32;
return true;
}
}
// Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
Subtarget)) {
DstVT = MaskVT;
return true;
}
}
// Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
DAG, Subtarget)) {
SrcVT = DstVT = MaskVT;
if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
return true;
}
}
return false;
}
static bool matchBinaryPermuteShuffle(
MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
PermuteImm = ByteRotation;
return true;
}
}
// Attempt to combine to X86ISD::BLENDI.
if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
(Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
RepeatedMask)) {
assert(RepeatedMask.size() == 8 &&
"Repeated mask size doesn't match!");
PermuteImm = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
PermuteImm |= 1 << i;
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
Shuffle = X86ISD::BLENDI;
ShuffleVT = MaskVT;
return true;
}
} else {
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
ShuffleVT = MaskVT;
return true;
}
}
}
// Attempt to combine to INSERTPS.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector()) {
if (Zeroable.getBoolValue() &&
matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
}
}
// Attempt to combine to SHUFPD.
if (AllowFloatDomain && EltSizeInBits == 64 &&
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
}
}
// Attempt to combine to SHUFPS.
if (AllowFloatDomain && EltSizeInBits == 32 &&
((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
// Match each half of the repeated mask to determine whether it is just
// referencing one of the vectors, is zeroable, or is entirely undef.
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
if (isUndefInRange(RepeatedMask, Offset, 2)) {
return DAG.getUNDEF(MaskVT);
} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
S0 = (SM_SentinelUndef == M0 ? -1 : 0);
S1 = (SM_SentinelUndef == M1 ? -1 : 1);
return getZeroVector(MaskVT, Subtarget, DAG, DL);
} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V1;
} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V2;
}
return SDValue();
};
int ShufMask[4] = {-1, -1, -1, -1};
SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
if (Lo && Hi) {
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
}
}
return false;
}
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget);
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
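/// For example, a chain whose accumulated v4f32 mask is {0, 0, 2, 2} can be
/// rebuilt as a single MOVSLDUP, while deeper or more irregular masks may
/// instead be lowered to a single variable-mask shuffle such as PSHUFB.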
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask,
bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
// Find the inputs that enter the chain. Note that multiple uses are OK
// here; we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
: peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch");
SDLoc DL(Root);
SDValue Res;
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
return DAG.getBitcast(RootVT, V1);
}
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
(RootVT.isFloatingPoint() && Depth >= 2) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are an AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
// TODO - this currently prevents all lane shuffles from occurring.
// TODO - check for writemasks usage instead of always preventing combining.
// TODO - attempt to narrow Mask back to writemask size.
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
// Attempt to match a subvector broadcast.
// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
if (UnaryShuffle &&
(BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
SDValue Src = Inputs[0];
if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(0).isUndef() &&
Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
Src.getValueType(),
Src.getOperand(1)));
}
}
}
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
// we need to use the zeroing feature.
// TODO - this should support binary shuffles.
if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
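// VPERM2X128 immediate: bits[1:0] select the source 128-bit lane written to
// the low half of the result, bits[5:4] the lane written to the high half,
// and 0x8 in either nibble zeroes that half. E.g. BaseMask {1, 0} yields
// PermMask 0x01 (swap the two lanes).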
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
DAG.getConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// For masks that have been widened to 128-bit elements or more,
// narrow back down to 64-bit elements.
SmallVector<int, 64> Mask;
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
} else {
Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
}
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
// Determine the effective mask value type.
FloatDomain &= (32 <= MaskEltSizeInBits);
MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
: MVT::getIntegerVT(MaskEltSizeInBits);
MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
// Only allow legal mask types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return SDValue();
// Attempt to match the mask against known shuffle patterns.
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
// TODO: Should we indicate which domain is preferred if both are allowed?
bool AllowFloatDomain = FloatDomain || (Depth > 3);
bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
APInt Zeroable(NumMaskElts, 0);
for (unsigned i = 0; i != NumMaskElts; ++i)
if (isUndefOrZero(Mask[i]))
Zeroable.setBit(i);
if (UnaryShuffle) {
// If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
(cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
MaskEltSizeInBits) == 0) {
unsigned Scale =
cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
MaskEltSizeInBits;
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
return DAG.getBitcast(RootVT, V1);
}
}
// Attempt to match against broadcast-from-vector.
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
&& (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (Subtarget.hasAVX2()) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
}
}
SDValue NewV1 = V1; // Save operand in case early exit happens.
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
return DAG.getBitcast(RootVT, Res);
}
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
if (matchBinaryPermuteShuffle(
MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// Typically from here on, we need an integer version of MaskVT.
MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
V2 = DAG.getBitcast(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
return DAG.getBitcast(RootVT, Res);
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
return DAG.getBitcast(RootVT, Res);
}
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
}
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
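// E.g. a v4i32 mask {0, SM_SentinelZero, 2, SM_SentinelZero} becomes an AND
// (or FAND in the float domain) with the vector constant {-1, 0, -1, 0}.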
if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
continue;
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
Res = DAG.getBitcast(MaskVT, V1);
unsigned AndOpcode =
FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
return DAG.getBitcast(RootVT, Res);
}
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, use the variable mask to VPERMILPS.
// TODO: Combine other mask types at higher depths.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
for (int M : Mask) {
SDValue Idx =
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
return DAG.getBitcast(RootVT, Res);
}
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
if (AllowVariableMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
// Bits[3] - Match Bit.
// Bits[2:1] - (Per Lane) PD Shuffle Mask.
// Bits[2:0] - (Per Lane) PS Shuffle Mask.
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
VPerm2Idx.push_back(-1);
continue;
}
if (M == SM_SentinelZero) {
M2ZImm = 2;
VPerm2Idx.push_back(8);
continue;
}
int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
VPerm2Idx.push_back(Index);
}
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// If we have 3 or more shuffle instructions or a chain involving a variable
// mask, we can replace them with a single PSHUFB instruction profitably.
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
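// To illustrate the byte-mask construction below: a v4i32 element mask
// {1, 0, 3, 2} (Ratio == 4) expands to the byte mask
// {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; a byte value of 255 (bit 7 set)
// zeroes the corresponding destination byte.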
if (UnaryShuffle && AllowVariableMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
int NumBytes = RootVT.getSizeInBits() / 8;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
if (M == SM_SentinelZero) {
PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
Res = DAG.getBitcast(ByteVT, V1);
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
return DAG.getBitcast(RootVT, Res);
}
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
SmallVector<SDValue, 16> VPPERMMask;
int NumBytes = 16;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
if (M == SM_SentinelZero) {
VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::v16i8;
V1 = DAG.getBitcast(ByteVT, V1);
V2 = DAG.getBitcast(ByteVT, V2);
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
return DAG.getBitcast(RootVT, Res);
}
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input shuffle then lower to VPERMV3.
if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
}
// Failed to find any combines.
return SDValue();
}
// Combine an arbitrary chain of shuffles + extract_subvectors into a single
// instruction if possible.
//
// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
// type size to attempt to combine:
// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
// -->
// extract_subvector(shuffle(x,y,m2),0)
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumMaskElts = BaseMask.size();
unsigned NumInputs = Inputs.size();
if (NumInputs == 0)
return SDValue();
SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
SmallVector<unsigned, 4> Offsets(NumInputs, 0);
// Peek through subvectors.
// TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
for (unsigned i = 0; i != NumInputs; ++i) {
SDValue &Src = WideInputs[i];
unsigned &Offset = Offsets[i];
Src = peekThroughBitcasts(Src);
EVT BaseVT = Src.getValueType();
while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Src.getOperand(1))) {
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
"Unexpected subvector extraction");
Offset /= BaseVT.getVectorNumElements();
Offset *= NumMaskElts;
}
// Bail if we're always extracting from the lowest subvectors;
// combineX86ShuffleChain should match this for the current width.
if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
return SDValue();
EVT RootVT = Root.getValueType();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned Scale = WideSizeInBits / RootSizeInBits;
assert((WideSizeInBits % RootSizeInBits) == 0 &&
"Unexpected subvector extraction");
// If the src vector types aren't the same, see if we can extend
// them to match each other.
// TODO: Support different scalar types?
EVT WideSVT = WideInputs[0].getValueType().getScalarType();
if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
Op.getValueType().getScalarType() != WideSVT;
}))
return SDValue();
for (SDValue &NewInput : WideInputs) {
assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
"Shuffle vector size mismatch");
if (WideSizeInBits > NewInput.getValueSizeInBits())
NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
SDLoc(NewInput), WideSizeInBits);
assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
"Unexpected subvector extraction");
}
// Create new mask for larger type.
for (unsigned i = 1; i != NumInputs; ++i)
Offsets[i] += i * Scale * NumMaskElts;
SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
for (int &M : WideMask) {
if (M < 0)
continue;
M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
}
WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
assert(!WideInputs.empty() && "Shuffle with no inputs detected");
if (WideInputs.size() > 2)
return SDValue();
// Increase depth for every upper subvector we've peeked through.
Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
// Attempt to combine wider chain.
// TODO: Can we use a better Root?
SDValue WideRoot = WideInputs[0];
if (SDValue WideShuffle = combineX86ShuffleChain(
WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget)) {
WideShuffle =
extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
return DAG.getBitcast(RootVT, WideShuffle);
}
return SDValue();
}
// Attempt to constant fold all of the constant source ops.
// Returns the folded constant if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
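// Illustrative example: a mask <0,5,2,7> over two constant v4i32 source ops
// folds to a new v4i32 constant built from elements 0 and 2 of the first op
// and elements 1 and 3 of the second.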
static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
ArrayRef<int> Mask, SDValue Root,
bool HasVariableMask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
unsigned SizeInBits = VT.getSizeInBits();
unsigned NumMaskElts = Mask.size();
unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
unsigned NumOps = Ops.size();
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
SmallVector<APInt, 16> UndefEltsOps(NumOps);
SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
RawBitsOps[i]))
return SDValue();
}
// Only fold if at least one of the constants is only used once or the
// combined shuffle has included a variable mask shuffle; this is to avoid
// constant pool bloat.
if (!OneUseConstantOp && !HasVariableMask)
return SDValue();
// Shuffle the constant bits according to the mask.
APInt UndefElts(NumMaskElts, 0);
APInt ZeroElts(NumMaskElts, 0);
APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
} else if (M == SM_SentinelZero) {
ZeroElts.setBit(i);
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
UndefElts.setBit(i);
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
ZeroElts.setBit(i);
continue;
}
ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Create the constant data.
MVT MaskSVT;
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
else
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
SDLoc DL(Root);
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
return DAG.getBitcast(VT, CstOp);
}
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
/// equivalent. In most cases, this is just an encoding size win, but
/// sometimes we will collapse multiple generic shuffles into a single
/// special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
/// instructions, and replace them with the slightly more expensive SSSE3
/// PSHUFB instruction if available. We do this as the last combining step
/// to ensure we avoid using PSHUFB if we can implement the shuffle with
/// a suitable short sequence of other instructions. The PSHUFB will either
/// use a register or have to read from memory and so is slightly (but only
/// slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
if (Depth > MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
Op = peekThroughOneUseBitcasts(Op);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return SDValue(); // Bail if we hit a non-vector.
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
return SDValue();
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
if (InputBC == peekThroughBitcasts(Ops[i]))
return i;
// Match failed - should we replace an existing Op?
if (InsertionPoint >= 0) {
Ops[InsertionPoint] = Input;
return InsertionPoint;
}
// Add to the end of the Ops list.
Ops.push_back(Input);
return Ops.size() - 1;
};
SmallVector<int, 2> OpInputIdx;
for (SDValue OpInput : OpInputs)
OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
// This function can be performance-critical, so we rely on the power-of-2
// knowledge that we have about the mask sizes to replace div/rem ops with
// bit-masks and shifts.
assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
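// Illustrative example: if RootMask has 4 elements (e.g. a dword mask) and
// OpMask has 8 (e.g. a word mask), then MaskWidth = 8, RootRatio = 2 and
// OpRatio = 1: each root element expands into two adjacent slots via
// RootMaskedIdx = (RootMask[i/2] << 1) + (i & 1), while OpMask indices are
// used as-is.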
SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
// root mask to get us all the way to the root value arrangement. The reason
// for this order is that we are recursing up the operation chain.
for (unsigned i = 0; i < MaskWidth; ++i) {
unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
Mask[i] = RootMask[RootIdx];
continue;
}
unsigned RootMaskedIdx =
RootRatio == 1
? RootMask[RootIdx]
: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
Mask[i] = RootMaskedIdx;
continue;
}
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, so it doesn't matter which ones we
// are using.
Mask[i] = OpMask[OpIdx];
continue;
}
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
unsigned OpMaskedIdx =
OpRatio == 1
? OpMask[OpIdx]
: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
Mask[i] = OpMaskedIdx;
}
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
// TODO - should we handle the mixed zero/undef case as well? Just returning
// a zero mask will lose information on undef elements, possibly reducing
// future combine possibilities.
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
// Update the list of shuffle nodes that have been combined so far.
SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
SrcNodes.end());
CombinedNodes.push_back(Op.getNode());
// See if we can recurse into each shuffle source op (if it's a target
// shuffle). The source op should generally only be combined if it either has
// a single use (i.e. the current Op) or all its users have already been
// combined; if not then we can still combine, but should prevent generation
// of variable shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
AllowVar, DAG, Subtarget))
return Res;
}
}
// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
// so it reduces the mask to the minimal-width mask that performs an
// equivalent shuffle.
SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
// Canonicalization of binary shuffle masks to improve pattern matching by
// commuting the inputs.
if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(Ops[0], Ops[1]);
}
// Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget);
}
// If that failed and any input is extracted then try to combine as a
// shuffle with the larger type.
return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
HasVariableMask, AllowVariableMask,
DAG, Subtarget);
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
}
/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
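/// For example, a PSHUFHW whose full v8i16 mask is <0,1,2,3,7,6,5,4> returns
/// <3,2,1,0> (the high-half words rebased to 0-3), while a PSHUFLW simply
/// returns its low 4 mask elements.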
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
SmallVector<SDValue, 2> Ops;
bool IsUnary;
bool HaveMask =
getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
// If we have more than 128-bits, only the low 128-bits of shuffle mask
// matter. Check that the upper masks are repeats and remove them.
if (VT.getSizeInBits() > 128) {
int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
for (int j = 0; j < LaneElts; ++j)
assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
"Mask doesn't repeat in high 128-bit lanes!");
#endif
Mask.resize(LaneElts);
}
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
case X86ISD::PSHUFLW:
Mask.resize(4);
return Mask;
case X86ISD::PSHUFHW:
Mask.erase(Mask.begin(), Mask.begin() + 4);
for (int &M : Mask)
M -= 4;
return Mask;
default:
llvm_unreachable("No valid shuffle instruction found!");
}
}
/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
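/// For example, PSHUFD <1,0,3,2> (PSHUFD <2,3,0,1> X) composes via
/// Mask[i] = VMask[Mask[i]] into a single PSHUFD <3,2,1,0> of X.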
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
// of the shuffles in the chain so that we can form a fresh chain to replace
// this one.
SmallVector<SDValue, 8> Chain;
SDValue V = N.getOperand(0);
for (; V.hasOneUse(); V = V.getOperand(0)) {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing combined!
case ISD::BITCAST:
// Skip bitcasts as we always know the type for the target specific
// instructions.
continue;
case X86ISD::PSHUFD:
// Found another dword shuffle.
break;
case X86ISD::PSHUFLW:
// Check that the low words (being shuffled) are the identity in the
// dword shuffle, and the high words are self-contained.
if (Mask[0] != 0 || Mask[1] != 1 ||
!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::PSHUFHW:
// Check that the high words (being shuffled) are the identity in the
// dword shuffle, and the low words are self-contained.
if (Mask[2] != 2 || Mask[3] != 3 ||
!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
V.getSimpleValueType().getVectorElementType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
unsigned CombineOp =
V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
if (V.getOperand(0) != V.getOperand(1) ||
!V->isOnlyUserOf(V.getOperand(0).getNode()))
return SDValue();
Chain.push_back(V);
V = V.getOperand(0);
do {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing to combine.
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
if (V.getOpcode() == CombineOp)
break;
Chain.push_back(V);
LLVM_FALLTHROUGH;
case ISD::BITCAST:
V = V.getOperand(0);
continue;
}
break;
} while (V.hasOneUse());
break;
}
// Break out of the loop if we break out of the switch.
break;
}
if (!V.hasOneUse())
// We fell out of the loop without finding a viable combining instruction.
return SDValue();
// Merge this node's mask and our incoming mask.
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
for (int &M : Mask)
M = VMask[M];
V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Rebuild the chain around this new shuffle.
while (!Chain.empty()) {
SDValue W = Chain.pop_back_val();
if (V.getValueType() != W.getOperand(0).getValueType())
V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
switch (W.getOpcode()) {
default:
llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
break;
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
break;
}
}
if (V.getValueType() != N.getValueType())
V = DAG.getBitcast(N.getValueType(), V);
// Return the new chain to replace N.
return V;
}
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
// single instruction.
if (VT.getScalarSizeInBits() == 64 &&
(Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
Opcode == X86ISD::UNPCKL)) {
auto BC0 = peekThroughBitcasts(N.getOperand(0));
auto BC1 = peekThroughBitcasts(N.getOperand(1));
EVT VT0 = BC0.getValueType();
EVT VT1 = BC1.getValueType();
unsigned Opcode0 = BC0.getOpcode();
unsigned Opcode1 = BC1.getOpcode();
if (Opcode0 == Opcode1 && VT0 == VT1 &&
(Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
SDValue Lo, Hi;
if (Opcode == X86ISD::MOVSD) {
Lo = BC1.getOperand(0);
Hi = BC0.getOperand(1);
} else {
Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
}
SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
return DAG.getBitcast(VT, Horiz);
}
}
switch (Opcode) {
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
EVT SrcVT = Src.getValueType();
EVT BCVT = BC.getValueType();
// If broadcasting from another shuffle, attempt to simplify it.
// TODO - we really need a general SimplifyDemandedVectorElts mechanism.
if (isTargetShuffle(BC.getOpcode()) &&
VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
SM_SentinelUndef);
for (unsigned i = 0; i != Scale; ++i)
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
}
// broadcast(bitcast(src)) -> bitcast(broadcast(src))
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
}
// Reduce broadcast source vector to lowest 128-bits.
if (SrcVT.getSizeInBits() > 128)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
extract128BitVector(Src, 0, DAG, DL));
// broadcast(scalar_to_vector(x)) -> broadcast(x).
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
// Share broadcast with the longest vector and extract low subvector (free).
for (SDNode *User : Src->uses())
if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
User->getValueSizeInBits(0) > VT.getSizeInBits()) {
return extractSubVector(SDValue(User, 0), 0, DAG, DL,
VT.getSizeInBits());
}
return SDValue();
}
case X86ISD::BLENDI: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
// TODO: Handle MVT::v16i16 repeated blend mask.
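// Illustrative example (assuming scaleVectorShuffleBlendMask repeats each
// mask bit Scale times): a v4f64 blend of bitcast v8f32 values with imm
// 0b0011 becomes a v8f32 blend with imm 0b00001111, bitcast back to v4f64.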
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
MVT SrcVT = N0.getOperand(0).getSimpleValueType();
if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
SrcVT.getScalarSizeInBits() >= 32) {
unsigned Mask = N.getConstantOperandVal(2);
unsigned Size = VT.getVectorNumElements();
unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
N1.getOperand(0),
DAG.getConstant(ScaleMask, DL, MVT::i8)));
}
}
return SDValue();
}
case X86ISD::VPERMI: {
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (N0.getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
return DAG.getBitcast(VT, Res);
}
return SDValue();
}
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
Mask = getPSHUFShuffleMask(N);
assert(Mask.size() == 4);
break;
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
// Canonicalize scalar FPOps:
// MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
// If commutable, allow OP(N1[0], N0[0]).
unsigned Opcode1 = N1.getOpcode();
if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
Opcode1 == ISD::FDIV) {
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
if (N10 == N0 ||
(N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
if (N10 != N0)
std::swap(N10, N11);
MVT SVT = VT.getVectorElementType();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
return DAG.getNode(Opcode, DL, VT, N0, SclVec);
}
}
return SDValue();
}
case X86ISD::INSERTPS: {
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
SDValue Op2 = N.getOperand(2);
unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
unsigned ZeroMask = InsertPSMask & 0xF;
// If we zero out all elements from Op0 then we don't need to reference it.
if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
// If we zero out the element from Op1 then we don't need to reference it.
if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getConstant(InsertPSMask, DL, MVT::i8));
// Attempt to merge insertps Op1 with an inner target shuffle node.
SmallVector<int, 8> TargetMask1;
SmallVector<SDValue, 2> Ops1;
if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
int M = TargetMask1[SrcIdx];
if (isUndefOrZero(M)) {
// Zero/UNDEF insertion - zero out element and remove dependency.
InsertPSMask |= (1u << DstIdx);
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
// Update insertps mask srcidx and reference the source input directly.
assert(0 <= M && M < 8 && "Shuffle index out of range");
InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
Op1 = Ops1[M < 4 ? 0 : 1];
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
return SDValue();
bool Updated = false;
bool UseInput00 = false;
bool UseInput01 = false;
for (int i = 0; i != 4; ++i) {
int M = TargetMask0[i];
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
// No change if element is already zero or the inserted element.
continue;
} else if (isUndefOrZero(M)) {
// If the target mask is undef/zero then we must zero the element.
InsertPSMask |= (1u << i);
Updated = true;
continue;
}
// The input vector element must be inline.
if (M != i && M != (i + 4))
return SDValue();
// Determine which inputs of the target shuffle we're using.
UseInput00 |= (0 <= M && M < 4);
UseInput01 |= (4 <= M);
}
// If we're not using both inputs of the target shuffle then use the
// referenced input directly.
if (UseInput00 && !UseInput01) {
Updated = true;
Op0 = Ops0[0];
} else if (!UseInput00 && UseInput01) {
Updated = true;
Op0 = Ops0[1];
}
if (Updated)
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
return SDValue();
}
default:
return SDValue();
}
// Nuke no-op shuffles that show up after combining.
if (isNoopShuffleMask(Mask))
return N.getOperand(0);
// Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
switch (N.getOpcode()) {
default:
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations. Note that it has to at least flip the
// dwords as otherwise it would have been removed as a no-op.
if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
V = DAG.getBitcast(DVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
return DAG.getBitcast(VT, V);
}
// Look for shuffle patterns which can be implemented as a single unpack.
// FIXME: This doesn't handle the location of the PSHUFD generically, and
// only works when we have a PSHUFD followed by two half-shuffles.
if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
V.hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int WordMask[8];
for (int i = 0; i < 4; ++i) {
WordMask[i + NOffset] = Mask[i] + NOffset;
WordMask[i + VOffset] = VMask[i] + VOffset;
}
// Map the word mask through the DWord mask.
int MappedMask[8];
for (int i = 0; i < 8; ++i)
MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
V = DAG.getBitcast(VT, D.getOperand(0));
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
DL, VT, V, V);
}
}
}
break;
case X86ISD::PSHUFD:
if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
break;
}
return SDValue();
}
/// Checks if the shuffle mask takes subsequent elements
/// alternately from two vectors.
/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
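/// For <0, 5, 2, 7>, even result lanes come from operand 0 and odd lanes from
/// operand 1, so \p Op0Even is set to true.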
static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
int ParitySrc[2] = {-1, -1};
unsigned Size = Mask.size();
for (unsigned i = 0; i != Size; ++i) {
int M = Mask[i];
if (M < 0)
continue;
// Make sure we are using the matching element from the input.
if ((M % Size) != i)
return false;
// Make sure we use the same input for all elements of the same parity.
int Src = M / Size;
if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
return false;
ParitySrc[i % 2] = Src;
}
// Make sure each input is used.
if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
return false;
Op0Even = ParitySrc[0] == 0;
return true;
}
/// Returns true iff the shuffle node \p N can be replaced with an
/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and
/// \p Opnd1.
///
/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
/// shuffle nodes so it is easier to generically match. We also insert dummy
/// vector shuffle nodes for the operands which explicitly discard the lanes
/// which are unused by this operation, to try to flow the fact that they're
/// unused through the rest of the combiner.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
bool &IsSubAdd) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
!VT.getSimpleVT().isFloatingPoint())
return false;
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
// Make sure we have an FADD and an FSUB.
if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
(V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
V1.getOpcode() == V2.getOpcode())
return false;
// If there are other uses of these operations we can't fold them.
if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
SDValue LHS, RHS;
if (V1.getOpcode() == ISD::FSUB) {
LHS = V1->getOperand(0); RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
return false;
} else {
assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
LHS = V2->getOperand(0); RHS = V2->getOperand(1);
if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
(V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
return false;
}
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return false;
// It's a subadd if the vector in the even parity is an FADD.
IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
: V2->getOpcode() == ISD::FADD;
Opnd0 = LHS;
Opnd1 = RHS;
return true;
}
/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue combineShuffleToFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
MVT VT = N->getSimpleValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
return SDValue();
// We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue FMAdd = Op0, FMSub = Op1;
if (FMSub.getOpcode() != X86ISD::FMSUB)
std::swap(FMAdd, FMSub);
if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
FMAdd.getOperand(2) != FMSub.getOperand(2))
return SDValue();
// Check for correct shuffle mask.
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return SDValue();
// FMAddSub takes the zeroth operand from the FMSub node.
SDLoc DL(N);
bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
FMAdd.getOperand(2));
}
/// Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
return V;
SDValue Opnd0, Opnd1;
bool IsSubAdd;
if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
return SDValue();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
}
if (IsSubAdd)
return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
return SDValue();
EVT VT = N->getValueType(0);
// We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (VT.getVectorElementType() != MVT::i32 &&
VT.getVectorElementType() != MVT::i64 &&
VT.getVectorElementType() != MVT::f32 &&
VT.getVectorElementType() != MVT::f64)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Check that both sources are concats with undef.
if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
!N1.getOperand(1).isUndef())
return SDValue();
// Construct the new shuffle mask. Elements from the first source retain their
// index, but elements from the second source no longer need to skip an undef.
SmallVector<int, 8> Mask;
int NumElts = VT.getVectorNumElements();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
for (int Elt : SVOp->getMask())
Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
SDLoc DL(N);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
N1.getOperand(0));
return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
return SDValue();
// For a broadcast, peek through an extract element of index 0 to find the
// horizontal op: broadcast (ext_vec_elt HOp, 0)
EVT VT = N->getValueType(0);
if (Opcode == X86ISD::VBROADCAST) {
SDValue SrcOp = N->getOperand(0);
if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
SrcOp.getValueType() == MVT::f64 &&
SrcOp.getOperand(0).getValueType() == VT &&
isNullConstant(SrcOp.getOperand(1)))
N = SrcOp.getNode();
}
SDValue HOp = N->getOperand(0);
if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
return SDValue();
// 128-bit horizontal math instructions are defined to operate on adjacent
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// When the operands of a horizontal math op are identical, the low half of
// the result is the same as the high half. If a target shuffle is also
// replicating low and high halves, we don't need the shuffle.
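// For example, HADD(X, X) on v4i32 produces
// <X[0]+X[1], X[2]+X[3], X[0]+X[1], X[2]+X[3]>, so its low and high halves
// already match.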
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
if (HOp.getScalarValueSizeInBits() == 64) {
// movddup (hadd X, X) --> hadd X, X
// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
assert((HOp.getValueType() == MVT::v2f64 ||
HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT &&
"Unexpected type for h-op");
return HOp;
}
return SDValue();
}
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
// but this should be tied to whatever horizontal op matching and shuffle
// canonicalization are producing.
if (HOp.getValueSizeInBits() == 128 &&
(isTargetShuffleEquivalent(Mask, {0, 0}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
return HOp;
if (HOp.getValueSizeInBits() == 256 &&
(isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
isTargetShuffleEquivalent(
Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
return HOp;
return SDValue();
}
/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
/// low half of each source vector and does not set any high half elements in
/// the destination vector, narrow the shuffle to half its original size.
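/// For example (assuming getHalfShuffleMask maps the used halves to a
/// half-width mask), a v8f32 shuffle with mask <0,1,8,9,u,u,u,u> reads only
/// the low v4f32 half of each source and can be rebuilt as a v4f32 shuffle of
/// those halves, with the extract/insert being free subregister ops.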
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
if (!Shuf->getValueType(0).isSimple())
return SDValue();
MVT VT = Shuf->getSimpleValueType(0);
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// See if we can ignore all of the high elements of the shuffle.
ArrayRef<int> Mask = Shuf->getMask();
if (!isUndefUpperHalf(Mask))
return SDValue();
// Check if the shuffle mask accesses only the low half of each input vector
// (half-index output is 0 or 2).
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(Mask.size() / 2);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
(HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
return SDValue();
// Create a half-width shuffle to replace the unnecessarily wide shuffle.
// The trick is knowing that all of the insert/extract are actually free
// subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
// of narrow inputs into a narrow output, and that is always cheaper than
// the wide shuffle that we started with.
return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
Shuf->getOperand(1), HalfMask, HalfIdx1,
HalfIdx2, false, DAG);
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
if (SDValue V = narrowShuffle(Shuf, DAG))
return V;
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
if (SDValue HAddSub = foldShuffleOfHorizOp(N))
return HAddSub;
}
// During Type Legalization, when promoting illegal vector types,
// the backend might introduce new shuffle dag nodes and bitcasts.
//
// This code performs the following transformation:
// fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
// (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
//
// We do this only if both the bitcast and the BINOP dag nodes have
// one use. Also, perform this transformation only if the new binary
// operation is legal. This is to avoid introducing dag nodes that
// potentially need to be further expanded (or custom lowered) into a
// less optimal sequence of dag nodes.
if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
N->getOpcode() == ISD::VECTOR_SHUFFLE &&
N->getOperand(0).getOpcode() == ISD::BITCAST &&
N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue BC0 = N0.getOperand(0);
EVT SVT = BC0.getValueType();
unsigned Opcode = BC0.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
if (BC0.hasOneUse() && SVT.isVector() &&
SVT.getVectorNumElements() * 2 == NumElts &&
TLI.isOperationLegal(Opcode, VT)) {
bool CanFold = false;
switch (Opcode) {
default : break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
// isOperationLegal lies for integer ops on floating point types.
CanFold = VT.isInteger();
break;
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
// isOperationLegal lies for floating point ops on integer types.
CanFold = VT.isFloatingPoint();
break;
}
unsigned SVTNumElts = SVT.getVectorNumElements();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
CanFold = SVOp->getMaskElt(i) < 0;
if (CanFold) {
SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
}
}
}
// Attempt to combine into a vector load/broadcast.
if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
// (concat_vectors t2, undef))
// Into:
// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
// Since the latter can be efficiently lowered with VPERMD/VPERMQ
if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
return ShufConcat;
if (isTargetShuffle(N->getOpcode())) {
SDValue Op(N, 0);
if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
// instructions into higher-order shuffles. We do this after combining
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Simplify source operands based on shuffle mask.
// TODO - merge this into combineX86ShufflesRecursively.
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
return SDValue(N, 0);
}
// Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
// in the upper 64 bits.
// TODO: Can we generalize this using computeKnownBits.
if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
(VT == MVT::v2f64 || VT == MVT::v2i64) &&
N->getOperand(0).getOpcode() == ISD::BITCAST &&
(N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
SDValue In = N->getOperand(0).getOperand(0);
switch (In.getOpcode()) {
default:
break;
case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
if (In.getOperand(0).getValueType() == MVT::v2f64 ||
In.getOperand(0).getValueType() == MVT::v2i64)
return N->getOperand(0); // return the bitcast
break;
}
}
// Pull subvector inserts into undef through VZEXT_MOVL by making it an
// insert into a zero vector. This helps get VZEXT_MOVL closer to
// scalar_to_vectors where 256/512 are canonicalized to an insert and a
// 128-bit scalar_to_vector. This reduces the number of isel patterns.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
N->getOperand(0).hasOneUse() &&
N->getOperand(0).getOperand(0).isUndef() &&
isNullConstant(N->getOperand(0).getOperand(2))) {
SDValue In = N->getOperand(0).getOperand(1);
SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
Movl, N->getOperand(0).getOperand(2));
}
// If this is a vzmovl of a full vector load, replace it with a vzload, unless
// the load is volatile.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
ISD::isNormalLoad(N->getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
if (!LN->isVolatile()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
VT.getVectorElementType(),
LN->getPointerInfo(),
LN->getAlignment(),
MachineMemOperand::MOLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return VZLoad;
}
}
// Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
// operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
// FIXME: This can probably go away once we default to widening legalization.
if (Subtarget.hasSSE41() && VT == MVT::v4i32 &&
N->getOpcode() == ISD::VECTOR_SHUFFLE &&
N->getOperand(0).getOpcode() == ISD::BITCAST &&
N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
SDValue BC = N->getOperand(0);
SDValue MULUDQ = BC.getOperand(0);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
ArrayRef<int> Mask = SVOp->getMask();
if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
SDValue Op0 = MULUDQ.getOperand(0);
SDValue Op1 = MULUDQ.getOperand(1);
if (Op0.getOpcode() == ISD::BITCAST &&
Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
Op0.getOperand(0).getValueType() == MVT::v4i32) {
ShuffleVectorSDNode *SVOp0 =
cast<ShuffleVectorSDNode>(Op0.getOperand(0));
ArrayRef<int> Mask2 = SVOp0->getMask();
if (Mask2[0] == 0 && Mask2[1] == -1 &&
Mask2[2] == 1 && Mask2[3] == -1) {
Op0 = SVOp0->getOperand(0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
}
}
if (Op1.getOpcode() == ISD::BITCAST &&
Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
Op1.getOperand(0).getValueType() == MVT::v4i32) {
ShuffleVectorSDNode *SVOp1 =
cast<ShuffleVectorSDNode>(Op1.getOperand(0));
ArrayRef<int> Mask2 = SVOp1->getMask();
if (Mask2[0] == 0 && Mask2[1] == -1 &&
Mask2[2] == 1 && Mask2[3] == -1) {
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
Op1 = SVOp1->getOperand(0);
return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
}
}
}
}
return SDValue();
}
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
// Handle special case opcodes.
switch (Opc) {
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
APInt LHSUndef, LHSZero;
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
// Multiply by zero.
KnownZero = LHSZero | RHSZero;
break;
}
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
// We only need the bottom 64-bits of the (128-bit) shift amount.
SDValue Amt = Op.getOperand(1);
MVT AmtVT = Amt.getSimpleValueType();
assert(AmtVT.is128BitVector() && "Unexpected value type");
// If the shift amount is only ever used as an SSE shift amount then we know
// that only the bottom 64-bits are ever used.
bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
unsigned UseOpc = Use->getOpcode();
return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
UseOpc == X86ISD::VSRA) &&
Use->getOperand(0) != Amt;
});
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
Depth + 1, AssumeSingleUse))
return true;
LLVM_FALLTHROUGH;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt SrcUndef;
if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
Depth + 1))
return true;
// TODO convert SrcUndef to KnownUndef.
break;
}
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt SrcUndef, SrcZero;
APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
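// Illustrative example (assuming getPackDemandedElts maps result elements to
// their source lanes): for a 128-bit PACKSS producing v16i8 from two v8i16
// inputs, demanded result elements 0-7 map to the LHS and 8-15 to the RHS.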
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
SrcZero, TLO, Depth + 1))
return true;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
SrcZero, TLO, Depth + 1))
return true;
break;
}
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
APInt DemandedLHS, DemandedRHS;
getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
break;
}
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
KnownZero = SrcZero.zextOrTrunc(NumElts);
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
break;
}
case X86ISD::BLENDV: {
APInt SelUndef, SelZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
SelZero, TLO, Depth + 1))
return true;
// TODO: Use SelZero to adjust LHS/RHS DemandedElts.
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
KnownZero = LHSZero & RHSZero;
KnownUndef = LHSUndef & RHSUndef;
break;
}
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
return false;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
if (Src.getValueType() != VT)
Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
SDLoc(Op));
return TLO.CombineTo(Op, Src);
}
APInt SrcUndef, SrcZero;
APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::VPERMV: {
SDValue Mask = Op.getOperand(0);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
case X86ISD::VPERMILPV: {
SDValue Mask = Op.getOperand(1);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::VPPERM:
case X86ISD::VPERMIL2: {
SDValue Mask = Op.getOperand(2);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
}
// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
if ((VT.is256BitVector() || VT.is512BitVector()) &&
DemandedElts.lshr(NumElts / 2) == 0) {
unsigned SizeInBits = VT.getSizeInBits();
unsigned ExtSizeInBits = SizeInBits / 2;
// See if 512-bit ops only use the bottom 128-bits.
if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
// Zero upper elements.
case X86ISD::VZEXT_MOVL: {
SDLoc DL(Op);
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp =
TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
// Subvector broadcast.
case X86ISD::SUBV_BROADCAST: {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
if (Src.getValueSizeInBits() > ExtSizeInBits)
Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
else if (Src.getValueSizeInBits() < ExtSizeInBits) {
MVT SrcSVT = Src.getSimpleValueType().getScalarType();
MVT SrcVT =
MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
}
return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
TLO.DAG, DL, ExtSizeInBits));
}
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
// Shift by uniform.
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA:
// Shift by immediate.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
SDLoc DL(Op);
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp =
TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
case X86ISD::VPERMI: {
// Simplify PERMPD/PERMQ to extract_subvector.
// TODO: This should be done in shuffle combining.
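// e.g. a VPERMQ/VPERMPD mask of <2,3,u,u> just selects the upper 128-bit
// half of the source, i.e. an extract_subvector at element index 2.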
if (VT == MVT::v4f64 || VT == MVT::v4i64) {
SmallVector<int, 4> Mask;
DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
SDLoc DL(Op);
SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
return TLO.CombineTo(Op, Insert);
}
}
break;
}
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// Saturated Packs.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
// Horizontal Ops.
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
SDLoc DL(Op);
MVT ExtVT = VT.getSimpleVT();
ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
ExtSizeInBits / ExtVT.getScalarSizeInBits());
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue Ext1 =
extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
}
}
// Simplify target shuffles.
if (!isTargetShuffle(Opc) || !VT.isSimple())
return false;
// Get target shuffle mask.
bool IsUnary;
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs,
OpMask, IsUnary))
return false;
// Shuffle inputs must be the same type as the result.
if (llvm::any_of(OpInputs,
[VT](SDValue V) { return VT != V.getValueType(); }))
return false;
// Clear known elts that might have been set above.
KnownZero.clearAllBits();
KnownUndef.clearAllBits();
// Check if shuffle mask can be simplified to undef/zero/identity.
int NumSrcs = OpInputs.size();
for (int i = 0; i != NumElts; ++i) {
int &M = OpMask[i];
if (!DemandedElts[i])
M = SM_SentinelUndef;
else if (0 <= M && OpInputs[M / NumElts].isUndef())
M = SM_SentinelUndef;
}
if (isUndefInRange(OpMask, 0, NumElts)) {
KnownUndef.setAllBits();
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
}
if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
KnownZero.setAllBits();
return TLO.CombineTo(
Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
}
for (int Src = 0; Src != NumSrcs; ++Src)
if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
return TLO.CombineTo(Op, OpInputs[Src]);
// Attempt to simplify inputs.
for (int Src = 0; Src != NumSrcs; ++Src) {
int Lo = Src * NumElts;
APInt SrcElts = APInt::getNullValue(NumElts);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
int M = OpMask[i] - Lo;
if (0 <= M && M < NumElts)
SrcElts.setBit(M);
}
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
}
// Extract known zero/undef elements.
// TODO - Propagate input undef/zero elts.
for (int i = 0; i != NumElts; ++i) {
if (OpMask[i] == SM_SentinelUndef)
KnownUndef.setBit(i);
if (OpMask[i] == SM_SentinelZero)
KnownZero.setBit(i);
}
return false;
}
bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue Op, const APInt &OriginalDemandedBits,
const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
switch (Opc) {
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
KnownBits KnownOp;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
break;
}
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
if (ShiftImm->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = ShiftImm->getZExtValue();
APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
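// e.g. with C1 == 6 and ShAmt == 4, the pair folds to a single (X >>u 2),
// provided the low 4 bits of the result are not demanded.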
if (Op0.getOpcode() == X86ISD::VSRLI &&
OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
int Diff = ShAmt - Shift2Imm->getZExtValue();
if (Diff == 0)
return TLO.CombineTo(Op, Op0.getOperand(0));
unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
SDValue NewShift = TLO.DAG.getNode(
NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
return TLO.CombineTo(Op, NewShift);
}
}
}
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
}
break;
}
case X86ISD::VSRLI: {
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
if (ShiftImm->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = ShiftImm->getZExtValue();
APInt DemandedMask = OriginalDemandedBits << ShAmt;
if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
OriginalDemandedElts, Known, TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
}
break;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
if (ShiftImm->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = ShiftImm->getZExtValue();
APInt DemandedMask = OriginalDemandedBits << ShAmt;
// If we just want the sign bit then we don't need to shift it.
if (OriginalDemandedBits.isSignMask())
return TLO.CombineTo(Op, Op0);
// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
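// The VSHLI discards the top C1 bits and the VSRAI recreates them from the
// sign, so if X already has more than C1 sign bits the round trip is a no-op.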
if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
SDValue Op00 = Op0.getOperand(0);
unsigned NumSignBits =
TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
if (ShAmt < NumSignBits)
return TLO.CombineTo(Op, Op00);
}
// If any of the demanded bits are produced by the sign extension, we also
// demand the input sign bit.
if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
DemandedMask.setSignBit();
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
if (Known.Zero[BitWidth - ShAmt - 1] ||
OriginalDemandedBits.countLeadingZeros() >= ShAmt)
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
// High bits are known one.
if (Known.One[BitWidth - ShAmt - 1])
Known.One.setHighBits(ShAmt);
}
break;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Vec = Op.getOperand(0);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
MVT VecVT = Vec.getSimpleValueType();
unsigned NumVecElts = VecVT.getVectorNumElements();
if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
unsigned Idx = CIdx->getZExtValue();
unsigned VecBitWidth = VecVT.getScalarSizeInBits();
// If we demand no bits from the vector then we must have demanded
// bits from the implicit zext - simplify to zero.
APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
if (DemandedVecBits == 0)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
APInt KnownUndef, KnownZero;
APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
KnownBits KnownVec;
if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
Known = KnownVec.zext(BitWidth, true);
return false;
}
break;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
MVT VecVT = Vec.getSimpleValueType();
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
unsigned Idx = CIdx->getZExtValue();
if (!OriginalDemandedElts[Idx])
return TLO.CombineTo(Op, Vec);
KnownBits KnownVec;
APInt DemandedVecElts(OriginalDemandedElts);
DemandedVecElts.clearBit(Idx);
if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
KnownBits KnownScl;
unsigned NumSclBits = Scl.getScalarValueSizeInBits();
APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
return true;
KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
Known.One = KnownVec.One & KnownScl.One;
Known.Zero = KnownVec.Zero & KnownScl.Zero;
return false;
}
break;
}
case X86ISD::PACKSS:
// PACKSS saturates to MIN/MAX integer values, so if we just want the
// sign bit then we can just ask for the source operands' sign bits.
// TODO - add known bits handling.
if (OriginalDemandedBits.isSignMask()) {
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
KnownBits KnownLHS, KnownRHS;
APInt SignMask = APInt::getSignMask(BitWidth * 2);
if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
KnownLHS, TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
KnownRHS, TLO, Depth + 1))
return true;
}
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// If we only need the sign bit then we can use R directly.
if (OriginalDemandedBits.isSignMask() &&
ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
return TLO.CombineTo(Op, Op.getOperand(1));
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
unsigned SrcBits = SrcVT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// If we don't need the sign bits at all just return zero.
if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
// Only demand the vector elements of the sign bits we need.
APInt KnownUndef, KnownZero;
APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
TLO, Depth + 1))
return true;
Known.Zero = KnownZero.zextOrSelf(BitWidth);
Known.Zero.setHighBits(BitWidth - NumElts);
// MOVMSK only uses the MSB from each vector element.
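// e.g. for a v4f32 MOVMSK, result bit i is just the sign bit of element i,
// so only the sign bit of each demanded element matters.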
KnownBits KnownSrc;
if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
KnownSrc, TLO, Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
Known.One.setLowBits(NumElts);
else if (KnownSrc.Zero[SrcBits - 1])
Known.Zero.setLowBits(NumElts);
return false;
}
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
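/// For example, an extract of element 0 from a PSHUFD of a one-use load can be
/// rebuilt as a generic VECTOR_SHUFFLE, letting the DAG combiner turn the whole
/// sequence into a single scalar element load.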
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue InVec = N->getOperand(0);
SDValue EltNo = N->getOperand(1);
EVT EltVT = N->getValueType(0);
if (!isa<ConstantSDNode>(EltNo))
return SDValue();
EVT OriginalVT = InVec.getValueType();
// Peek through bitcasts, don't duplicate a load with other uses.
InVec = peekThroughOneUseBitcasts(InVec);
EVT CurrentVT = InVec.getValueType();
if (!CurrentVT.isVector() ||
CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
return SDValue();
if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
// Don't duplicate a load with other uses.
if (!InVec.hasOneUse())
return SDValue();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 2> ShuffleOps;
bool UnaryShuffle;
if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
ShuffleOps, ShuffleMask, UnaryShuffle))
return SDValue();
// Select the input vector, guarding against out of range extract vector.
unsigned NumElems = CurrentVT.getVectorNumElements();
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
if (Idx == SM_SentinelZero)
return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
: DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
if (Idx == SM_SentinelUndef)
return DAG.getUNDEF(EltVT);
// Bail if any mask element is SM_SentinelZero - getVectorShuffle below
// won't handle it.
if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
return SDValue();
assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1];
// If the inputs to the shuffle are the same for both operands, then allow 2 uses.
unsigned AllowedUses =
(ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
return SDValue();
AllowedUses = 1; // only allow 1 load use if we have a bitcast
LdNode = LdNode.getOperand(0);
}
if (!ISD::isNormalLoad(LdNode.getNode()))
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
return SDValue();
// If there's a bitcast before the shuffle, check if the load type and
// alignment is valid.
unsigned Align = LN0->getAlignment();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
EltVT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
return SDValue();
// All checks match, so transform back to vector_shuffle so that the DAG
// combiner can finish the job.
SDLoc dl(N);
// Create the shuffle node, taking into account that it may be a unary shuffle.
SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
ShuffleMask);
Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
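// e.g. (v4i1 and (setcc v4i64 a, b), (setcc v4i64 c, d)) is recognized as
// having a 256-bit source.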
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
switch (Src.getOpcode()) {
case ISD::SETCC:
return Src.getOperand(0).getValueSizeInBits() == Size;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
checkBitcastSrcVectorSize(Src.getOperand(1), Size);
}
return false;
}
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
const SDLoc &DL,
const X86Subtarget &Subtarget) {
EVT SrcVT = Src.getValueType();
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
(Src.getOperand(0).getValueType() == MVT::v16i8 ||
Src.getOperand(0).getValueType() == MVT::v32i8 ||
Src.getOperand(0).getValueType() == MVT::v64i8);
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
// v8i16 and v16i16.
// For these two cases, we can shuffle the upper element bytes to a
// consecutive sequence at the start of the vector and treat the results as
// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
// for v16i16 this is not the case, because the shuffle is expensive, so we
// avoid sign-extending to this type entirely.
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v2i1:
SExtVT = MVT::v2i64;
break;
case MVT::v4i1:
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256))
SExtVT = MVT::v4i64;
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
// sign-extend to a 256-bit operation to match the compare.
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
// TODO: use checkBitcastSrcVectorSize.
if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
(Src.getOperand(0).getValueType().is256BitVector() ||
Src.getOperand(0).getValueType().is512BitVector())) {
SExtVT = MVT::v8i32;
}
break;
case MVT::v16i1:
SExtVT = MVT::v16i8;
// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
// it is not profitable to sign-extend to 256-bit because this will
// require an extra cross-lane shuffle which is more expensive than
// truncating the result of the compare to 128-bits.
break;
case MVT::v32i1:
SExtVT = MVT::v32i8;
break;
case MVT::v64i1:
// If we have AVX512F but not AVX512BW, and the input is truncated from
// v64i8 (checked earlier), then split the input and make two pmovmskbs.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
SExtVT = MVT::v64i8;
break;
}
return SDValue();
};
SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
if (SExtVT == MVT::v64i8) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
DAG.getConstant(32, DL, MVT::i8));
V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
} else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
V = getPMOVMSKB(DL, V, DAG, Subtarget);
} else {
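// The sign-extended v8i16 elements are 0 or -1, so PACKSS saturates them to
// 0/-1 bytes; MOVMSK of the resulting v16i8 then yields the 8 mask bits in
// the low byte (the upper 8 lanes come from undef and are dropped by the
// final truncation below).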
if (SExtVT == MVT::v8i16)
V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
DAG.getUNDEF(MVT::v8i16));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
EVT IntVT =
EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
V = DAG.getZExtOrTrunc(V, DL, IntVT);
return DAG.getBitcast(VT, V);
}
// Convert a vXi1 constant build vector to the same width scalar integer.
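// e.g. (v4i1 <1, 0, undef, 1>) becomes (i4 0b1001); undef lanes are treated
// as zero.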
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
EVT SrcVT = Op.getValueType();
assert(SrcVT.getVectorElementType() == MVT::i1 &&
"Expected a vXi1 vector");
assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
"Expected a constant build vector");
APInt Imm(SrcVT.getVectorNumElements(), 0);
for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
SDValue In = Op.getOperand(Idx);
if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
Imm.setBit(Idx);
}
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
return DAG.getConstant(Imm, SDLoc(Op), IntVT);
}
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
if (!DCI.isBeforeLegalizeOps())
return SDValue();
// Only do this if we have k-registers.
if (!Subtarget.hasAVX512())
return SDValue();
EVT DstVT = N->getValueType(0);
SDValue Op = N->getOperand(0);
EVT SrcVT = Op.getValueType();
if (!Op.hasOneUse())
return SDValue();
// Look for logic ops.
if (Op.getOpcode() != ISD::AND &&
Op.getOpcode() != ISD::OR &&
Op.getOpcode() != ISD::XOR)
return SDValue();
// Make sure we have a bitcast between mask registers and a scalar type.
if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
DstVT.isScalarInteger()) &&
!(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
SrcVT.isScalarInteger()))
return SDValue();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
LHS.getOperand(0).getValueType() == DstVT)
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
DAG.getBitcast(DstVT, RHS));
if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
RHS.getOperand(0).getValueType() == DstVT)
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
// If the RHS is a vXi1 build vector, this is a good reason to flip too.
// Most of these have to move a constant from the scalar domain anyway.
if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
RHS = combinevXi1ConstantToInteger(RHS, DAG);
return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
DAG.getBitcast(DstVT, LHS), RHS);
}
return SDValue();
}
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(BV);
unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
auto CreateMMXElement = [&](SDValue V) {
if (V.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
if (V.getValueType().isFloatingPoint()) {
if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
V = DAG.getBitcast(MVT::v2i64, V);
return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
}
V = DAG.getBitcast(MVT::i32, V);
} else {
V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
}
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
};
// Convert build vector ops to MMX data in the bottom elements.
SmallVector<SDValue, 8> Ops;
// Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
if (Splat) {
if (Splat.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
Splat = CreateMMXElement(Splat);
if (Subtarget.hasSSE1()) {
// Unpack v8i8 to splat i8 elements to lowest 16-bits.
if (NumElts == 8)
Splat = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
Splat);
// Use PSHUFW to repeat 16-bit elements.
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
DAG.getConstant(ShufMask, DL, MVT::i8));
}
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
Ops.push_back(CreateMMXElement(BV->getOperand(i)));
}
// Use tree of PUNPCKLs to build up general MMX vector.
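// e.g. for 8 byte elements: PUNPCKLBW pairs them into 4 values, PUNPCKLWD
// into 2, and a final PUNPCKLDQ produces the full 64-bit vector.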
while (Ops.size() > 1) {
unsigned NumOps = Ops.size();
unsigned IntrinOp =
(NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
: (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
: Intrinsic::x86_mmx_punpcklbw));
SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
for (unsigned i = 0; i != NumOps; i += 2)
Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
Ops[i], Ops[i + 1]);
Ops.resize(NumOps / 2);
}
return Ops[0];
}
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
SDLoc dl(N);
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
DAG.getIntPtrConstant(0, dl));
}
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
}
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
if (VT == MVT::x86mmx) {
// Detect MMX constant vectors.
APInt UndefElts;
SmallVector<APInt, 1> EltBits;
if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
SDLoc DL(N0);
// Handle zero-extension of i32 with MOVD.
if (EltBits[0].countLeadingZeros() >= 32)
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
// Else, bitcast to a double.
// TODO - investigate supporting sext 32-bit immediates on x86_64.
APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
}
// Detect bitcasts to x86mmx low word.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
bool LowUndef = true, AllUndefOrZero = true;
for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N0.getOperand(i);
LowUndef &= Op.isUndef() || (i >= e/2);
AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
}
if (AllUndefOrZero) {
SDValue N00 = N0.getOperand(0);
SDLoc dl(N00);
N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
: DAG.getZExtOrTrunc(N00, dl, MVT::i32);
return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
}
}
// Detect bitcasts of 64-bit build vectors and convert to a
// MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
// lowest element.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0.getOperand(0);
if (N00.getValueType().is128BitVector())
return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
DAG.getBitcast(MVT::v2i64, N00));
}
// Detect bitcasts from FP_TO_SINT to x86mmx.
if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
SDLoc DL(N0);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
DAG.getBitcast(MVT::v2i64, Res));
}
}
// Try to remove a bitcast of constant vXi1 vector. We have to legalize
// most of these to scalar anyway.
if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
return combinevXi1ConstantToInteger(N0, DAG);
}
if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
isa<ConstantSDNode>(N0)) {
auto *C = cast<ConstantSDNode>(N0);
if (C->isAllOnesValue())
return DAG.getConstant(1, SDLoc(N0), VT);
if (C->isNullValue())
return DAG.getConstant(0, SDLoc(N0), VT);
}
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
return V;
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
// constant in an integer register and transferring it to an SSE register or
// transferring the SSE operand to integer register and back.
unsigned FPOpcode;
switch (N0.getOpcode()) {
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
}
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64)))
return SDValue();
SDValue LogicOp0 = N0.getOperand(0);
SDValue LogicOp1 = N0.getOperand(1);
SDLoc DL0(N0);
// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
}
// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
}
return SDValue();
}
// Given an ABS node, detect the following pattern:
// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
SDValue AbsOp1 = Abs->getOperand(0);
if (AbsOp1.getOpcode() != ISD::SUB)
return false;
Op0 = AbsOp1.getOperand(0);
Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
Op1.getOpcode() != ISD::ZERO_EXTEND ||
Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
return false;
return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
const SDValue &Zext1, const SDLoc &DL,
const X86Subtarget &Subtarget) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
unsigned RegSize = std::max(128u, InVT.getSizeInBits());
// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
// fill in the missing vector elements with 0.
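// e.g. a v4i8 input is concatenated with three v4i8 zero vectors to form a
// v16i8 PSADBW operand.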
unsigned NumConcat = RegSize / InVT.getSizeInBits();
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
Ops[0] = Zext0.getOperand(0);
MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
Ops[0] = Zext1.getOperand(0);
SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
// Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
};
MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
PSADBWBuilder);
}
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE41.
if (!Subtarget.hasSSE41())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
return SDValue();
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Src = DAG.matchBinOpReduction(
Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
if (!Src)
return SDValue();
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
return SDValue();
SDLoc DL(Extract);
SDValue MinPos = Src;
// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
while (SrcVT.getSizeInBits() > 128) {
unsigned NumElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = NumElts / 2;
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
unsigned SubSizeInBits = SrcVT.getSizeInBits();
SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
(SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
"Unexpected value type");
// PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
// to flip the value accordingly.
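// e.g. for i16 elements: SMIN uses x ^ 0x8000 (maps signed order onto
// unsigned order), UMAX uses x ^ 0xFFFF (reverses the order), and SMAX uses
// x ^ 0x7FFF; the same XOR after the PHMINPOSUW undoes the transform.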
SDValue Mask;
unsigned MaskEltsBits = ExtractVT.getSizeInBits();
if (BinOp == ISD::SMAX)
Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::SMIN)
Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::UMAX)
Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
// For v16i8 cases we need to perform UMIN on pairs of byte elements,
// shuffling each upper element down and inserting zeros. This means that the
// v16i8 UMIN will leave the upper element as zero, performing the
// zero-extension ready for the PHMINPOS.
if (ExtractVT == MVT::i8) {
SDValue Upper = DAG.getVectorShuffle(
SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
{1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
}
// Perform the PHMINPOS on a v8i16 vector.
MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
MinPos = DAG.getBitcast(SrcVT, MinPos);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
DAG.getIntPtrConstant(0, DL));
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with
// a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
unsigned BitWidth = ExtractVT.getSizeInBits();
if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
return SDValue();
// Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
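// e.g. any_of is an OR tree over the vector lanes, all_of an AND tree, and
// (for i1 extracts only) parity an XOR tree.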
ISD::NodeType BinOp;
SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
if (!Match && ExtractVT == MVT::i1)
Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
if (!Match)
return SDValue();
// EXTRACT_VECTOR_ELT can require implicit extension of the vector element
// which we can't support here for now.
if (Match.getScalarValueSizeInBits() != BitWidth)
return SDValue();
SDValue Movmsk;
SDLoc DL(Extract);
EVT MatchVT = Match.getValueType();
unsigned NumElts = MatchVT.getVectorNumElements();
if (ExtractVT == MVT::i1) {
// Special case for (pre-legalization) vXi1 reductions.
if (NumElts > 32)
return SDValue();
if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
// If this is a legal AVX512 predicate type then we can just bitcast.
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
// Use combineBitcastvxi1 to create the MOVMSK.
if (NumElts == 32 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
NumElts = 16;
}
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
}
if (!Movmsk)
return SDValue();
Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32);
} else {
// Bail with AVX512VL (which uses predicate registers).
if (Subtarget.hasVLX())
return SDValue();
unsigned MatchSizeInBits = Match.getValueSizeInBits();
if (!(MatchSizeInBits == 128 ||
(MatchSizeInBits == 256 && Subtarget.hasAVX())))
return SDValue();
// Make sure this isn't a vector of 1 element. The perf win from using
// MOVMSK diminishes with fewer elements in the reduction, but it is
// generally better to get the comparison over to the GPRs as soon as
// possible to reduce the number of vector ops.
if (Match.getValueType().getVectorNumElements() < 2)
return SDValue();
// Check that we are extracting a reduction of all sign bits.
if (DAG.ComputeNumSignBits(Match) != BitWidth)
return SDValue();
if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
MatchSizeInBits = Match.getValueSizeInBits();
}
// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
MVT MaskSrcVT;
if (64 == BitWidth || 32 == BitWidth)
MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
MatchSizeInBits / BitWidth);
else
MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
NumElts = MaskSrcVT.getVectorNumElements();
}
assert(NumElts <= 32 && "Not expecting more than 32 elements");
if (BinOp == ISD::XOR) {
// parity -> (AND (CTPOP(MOVMSK X)), 1)
SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
}
SDValue CmpC;
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
CmpC = DAG.getConstant(0, DL, MVT::i32);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
CondCode = ISD::CondCode::SETEQ;
}
// The setcc produces an i8 of 0/1, so extend that to the result width and
// negate to get the final 0/-1 mask value.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetccVT =
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
if (!Subtarget.hasSSE2())
return SDValue();
// Verify the type we're extracting from is any integer type above i16.
EVT VT = Extract->getOperand(0).getValueType();
if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
return SDValue();
unsigned RegSize = 128;
if (Subtarget.useBWIRegs())
RegSize = 512;
else if (Subtarget.hasAVX())
RegSize = 256;
// We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (RegSize / VT.getVectorNumElements() < 8)
return SDValue();
// Match shuffle + add pyramid.
ISD::NodeType BinOp;
SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
// The operand is expected to be zero extended from i8
// (verified in detectZextAbsDiff).
// In order to convert to i64 and above, additional any/zero/sign
// extend is expected.
// The zero extend from 32 bits has no mathematical effect on the result.
// The sign extend is also effectively a zero extend here
// (it extends the sign bit, which is zero).
// So it is correct to skip the sign/zero extend instruction.
if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
Root.getOpcode() == ISD::ZERO_EXTEND ||
Root.getOpcode() == ISD::ANY_EXTEND))
Root = Root.getOperand(0);
// If there was a match, we want Root to be an ABS node that is the root of
// an abs-diff pattern.
if (!Root || Root.getOpcode() != ISD::ABS)
return SDValue();
// Check whether we have an abs-diff pattern feeding into the ABS.
SDValue Zext0, Zext1;
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
// Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
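// Each PSADBW lane already holds the sum of 8 absolute differences, so e.g.
// reducing 64 byte differences (Stages == 6) takes 3 further shuffle+add
// steps to fold the v8i64 SAD lanes together.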
unsigned Stages = Log2_32(VT.getVectorNumElements());
MVT SadVT = SAD.getSimpleValueType();
if (Stages > 3) {
unsigned SadElems = SadVT.getVectorNumElements();
for (unsigned i = Stages - 3; i > 0; --i) {
SmallVector<int, 16> Mask(SadElems, -1);
for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
Mask[j] = MaskEnd + j;
SDValue Shuffle =
DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
}
}
MVT Type = Extract->getSimpleValueType(0);
unsigned TypeSizeInBits = Type.getSizeInBits();
// Return the lowest TypeSizeInBits bits.
MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
SAD = DAG.getBitcast(ResVT, SAD);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
// Attempt to peek through a target shuffle and extract the scalar from the
// source.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue Src = N->getOperand(0);
SDValue Idx = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Don't attempt this for boolean mask vectors or unknown extraction indices.
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
SDValue SrcBC = peekThroughBitcasts(Src);
// Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
SDValue SrcOp = SrcBC.getOperand(0);
if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, SrcOp);
}
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
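// e.g. extracting an i32 element from a source matched as a v2i64 shuffle
// splits each mask entry M into 2*M and 2*M+1.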
if (Mask.size() != NumSrcElts) {
if ((NumSrcElts % Mask.size()) == 0) {
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
scaleShuffleMask<int>(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
// Simplify Mask based on demanded element.
int ExtractIdx = (int)N->getConstantOperandVal(1);
int Scale = Mask.size() / NumSrcElts;
int Lo = Scale * ExtractIdx;
int Hi = Scale * (ExtractIdx + 1);
for (int i = 0, e = (int)Mask.size(); i != e; ++i)
if (i < Lo || Hi <= i)
Mask[i] = SM_SentinelUndef;
SmallVector<int, 16> WidenedMask;
while (Mask.size() > NumSrcElts &&
canWidenShuffleElements(Mask, WidenedMask))
Mask = std::move(WidenedMask);
// TODO - investigate support for wider shuffle masks with known upper
// undef/zero elements for implicit zero-extension.
}
}
// Check if narrowing/widening failed.
if (Mask.size() != NumSrcElts)
return SDValue();
int SrcIdx = Mask[N->getConstantOperandVal(1)];
SDLoc dl(N);
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
return DAG.getUNDEF(VT);
if (SrcIdx == SM_SentinelZero)
return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
: DAG.getConstant(0, dl, VT);
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors and in certain
// circumstances, depending on SSE-level.
// TODO: Investigate using extract_subvector for larger vectors.
// TODO: Investigate float/double extraction if it will be just stored.
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
}
if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
"Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
}
return SDValue();
}
/// Extracting a scalar FP value from vector element 0 is free, so extract each
/// operand first, then perform the math as a scalar op.
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
SDValue Vec = ExtElt->getOperand(0);
SDValue Index = ExtElt->getOperand(1);
EVT VT = ExtElt->getValueType(0);
EVT VecVT = Vec.getValueType();
// TODO: If this is a unary/expensive/expand op, allow extraction from a
// non-zero element because the shuffle+scalar op will be cheaper?
if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
return SDValue();
// Vector FP compares don't fit the pattern of FP math ops (propagate, not
// extract, the condition code), so deal with those as a special-case.
if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
if (OpVT != MVT::f32 && OpVT != MVT::f64)
return SDValue();
// extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
SDLoc DL(ExtElt);
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
Vec.getOperand(0), Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
Vec.getOperand(1), Index);
return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
}
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
// Vector FP selects don't fit the pattern of FP math ops (because the
// condition has a different type and we have to change the opcode), so deal
// with those here.
// FIXME: This is restricted to pre type legalization by ensuring the setcc
// has i1 elements. If we loosen this we need to convert vector bool to a
// scalar bool.
if (Vec.getOpcode() == ISD::VSELECT &&
Vec.getOperand(0).getOpcode() == ISD::SETCC &&
Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
// ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
SDLoc DL(ExtElt);
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
Vec.getOperand(0).getValueType().getScalarType(),
Vec.getOperand(0), Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
Vec.getOperand(1), Index);
SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
Vec.getOperand(2), Index);
return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
}
// TODO: This switch could include FNEG and the x86-specific FP logic ops
// (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
// missed load folding and fma+fneg combining.
switch (Vec.getOpcode()) {
case ISD::FMA: // Begin 3 operands
case ISD::FMAD:
case ISD::FADD: // Begin 2 operands
case ISD::FSUB:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM:
case ISD::FCOPYSIGN:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
case X86ISD::FMAX:
case X86ISD::FMIN:
case ISD::FABS: // Begin 1 operand
case ISD::FSQRT:
case ISD::FRINT:
case ISD::FCEIL:
case ISD::FTRUNC:
case ISD::FNEARBYINT:
case ISD::FROUND:
case ISD::FFLOOR:
case X86ISD::FRCP:
case X86ISD::FRSQRT: {
// extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
SDLoc DL(ExtElt);
SmallVector<SDValue, 4> ExtOps;
for (SDValue Op : Vec->ops())
ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
}
default:
return SDValue();
}
llvm_unreachable("All opcodes should return within switch");
}
/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
return SDValue();
SDValue Index = ExtElt->getOperand(1);
if (!isNullConstant(Index))
return SDValue();
// TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
ISD::NodeType Opc;
SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
if (!Rdx)
return SDValue();
EVT VT = ExtElt->getValueType(0);
EVT VecVT = ExtElt->getOperand(0).getValueType();
if (VecVT.getScalarType() != VT)
return SDValue();
unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
SDLoc DL(ExtElt);
// 256-bit horizontal instructions operate on 128-bit chunks rather than
// across the whole vector, so we need an extract + hop preliminary stage.
// This is the only step where the operands of the hop are not the same value.
// TODO: We could extend this to handle 512-bit or even longer vectors.
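// e.g. for v8f32 the high and low v4f32 halves are combined with one hop
// here; the loop below then only needs log2(4) == 2 more hadd(x, x) steps.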
if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
unsigned NumElts = VecVT.getVectorNumElements();
SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
}
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
return SDValue();
// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
for (unsigned i = 0; i != ReductionSteps; ++i)
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
SDValue InputVector = N->getOperand(0);
SDValue EltIdx = N->getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
EVT SrcVT = InputVector.getValueType();
EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
// Integer Constant Folding.
if (CIdx && VT.isInteger()) {
APInt UndefVecElts;
SmallVector<APInt, 16> EltBits;
unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
EltBits, true, false)) {
uint64_t Idx = CIdx->getZExtValue();
if (UndefVecElts[Idx])
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
dl, VT);
}
}
// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
if (IsPextr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(
SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
return SDValue(N, 0);
return SDValue();
}
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
// Detect mmx extraction of all bits as an i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getBitcast(VT, InputVector);
}
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization.
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
return V;
if (SDValue V = scalarizeExtEltFP(N, DAG))
return V;
// Attempt to extract an i1 element by using MOVMSK to extract the sign bits
// and then testing the relevant element.
if (CIdx && SrcVT.getScalarType() == MVT::i1) {
SmallVector<SDNode *, 16> BoolExtracts;
auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Use->getOperand(1)) &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
return true;
}
return false;
};
if (all_of(InputVector->uses(), IsBoolExtract) &&
BoolExtracts.size() > 1) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
for (SDNode *Use : BoolExtracts) {
// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
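// Illustrative: for v4i1 X and MaskIdx == 2, BC is the 4-bit MOVMSK-style
// value, Mask == 0b0100, and the extract becomes ((BC & 0b0100) == 0b0100)
// as an i1.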
unsigned MaskIdx = Use->getConstantOperandVal(1);
APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
DCI.CombineTo(Use, Res);
}
return SDValue(N, 0);
}
}
}
return SDValue();
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
SDLoc DL(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
assert(CondVT.isVector() && "Vector select expects a vector selector!");
// Check if the first operand is all zeros and Cond type is vXi1.
// This situation only applies to avx512.
// TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
// TODO: Can we assert that both operands are not zeros (because that should
// get simplified at node creation time)?
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. I.e., the condition operand must
// have already been promoted from the IR select condition type <N x i1>.
// Don't check if the types themselves are equal because that excludes
// vector floating-point selects.
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
// Try to invert the condition if true value is not all 1s and false value is
// not all 0s. Only do this if the condition has one use.
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
// Check if the selector will be produced by CMPP*/PCMP*.
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted.
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
CondVT) {
bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
if (TValIsAllZeros || FValIsAllOnes) {
SDValue CC = Cond.getOperand(2);
ISD::CondCode NewCC =
ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
Cond.getOperand(0).getValueType().isInteger());
Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
NewCC);
std::swap(LHS, RHS);
TValIsAllOnes = FValIsAllOnes;
FValIsAllZeros = TValIsAllZeros;
}
}
// Cond value must be 'sign splat' to be converted to a logical op.
if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
return SDValue();
// vselect Cond, 111..., 000... -> Cond
if (TValIsAllOnes && FValIsAllZeros)
return DAG.getBitcast(VT, Cond);
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
return SDValue();
// vselect Cond, 111..., X -> or Cond, X
if (TValIsAllOnes) {
SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
return DAG.getBitcast(VT, Or);
}
// vselect Cond, X, 000... -> and Cond, X
if (FValIsAllZeros) {
SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
return DAG.getBitcast(VT, And);
}
// vselect Cond, 000..., X -> andn Cond, X
if (TValIsAllZeros) {
MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
return DAG.getBitcast(VT, AndN);
}
return SDValue();
}
/// If both arms of a vector select are concatenated vectors, split the select,
/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
return SDValue();
// TODO: Split 512-bit vectors too?
EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
return SDValue();
// TODO: Split as long as any 2 of the 3 operands are concatenated?
SDValue Cond = N->getOperand(0);
SDValue TVal = N->getOperand(1);
SDValue FVal = N->getOperand(2);
SmallVector<SDValue, 4> CatOpsT, CatOpsF;
if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
!collectConcatOps(TVal.getNode(), CatOpsT) ||
!collectConcatOps(FVal.getNode(), CatOpsF))
return SDValue();
auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
makeBlend, /*CheckBWI*/ false);
}
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
SDLoc DL(N);
auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
if (!TrueC || !FalseC)
return SDValue();
// Don't do this for crazy integer types.
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
// We're going to use the condition bit in math or logic ops. We could allow
// this with a wider condition value (post-legalization it becomes an i8),
// but if nothing is creating selects that late, it doesn't matter.
if (Cond.getValueType() != MVT::i1)
return SDValue();
// A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
// 3, 5, or 9 with i32/i64, so those get transformed too.
// TODO: For constants that overflow or do not differ by power-of-2 or small
// multiplier, convert to 'and' + 'add'.
const APInt &TrueVal = TrueC->getAPIntValue();
const APInt &FalseVal = FalseC->getAPIntValue();
bool OV;
APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
if (OV)
return SDValue();
APInt AbsDiff = Diff.abs();
if (AbsDiff.isPowerOf2() ||
((VT == MVT::i32 || VT == MVT::i64) &&
(AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
// We need a positive multiplier constant for shift/LEA codegen. The 'not'
// of the condition can usually be folded into a compare predicate, but even
// without that, the sequence should be cheaper than a CMOV alternative.
if (TrueVal.slt(FalseVal)) {
Cond = DAG.getNOT(DL, Cond, MVT::i1);
std::swap(TrueC, FalseC);
}
// select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
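// Illustrative: select Cond, 20, 4 --> (zext(Cond) * 16) + 4, where the
// multiply by the power-of-2 difference 16 later lowers to a shift.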
SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
// Multiply condition by the difference if non-one.
if (!AbsDiff.isOneValue())
R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
// Add the base if non-zero.
if (!FalseC->isNullValue())
R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
return R;
}
return SDValue();
}
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
if ((N->getOpcode() != ISD::VSELECT &&
N->getOpcode() != X86ISD::BLENDV) ||
ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
// Don't optimize before the condition has been transformed to a legal type
// and don't ever optimize vector selects that map to AVX512 mask-registers.
unsigned BitWidth = Cond.getScalarValueSizeInBits();
if (BitWidth < 8 || BitWidth > 64)
return SDValue();
// We can only handle the cases where VSELECT is directly legal on the
// subtarget. We custom lower VSELECT nodes with constant conditions and
// this makes it hard to see whether a dynamic VSELECT will correctly
// lower, so we both check the operation's status and explicitly handle the
// cases where a *dynamic* blend will fail even though a constant-condition
// blend could be custom lowered.
// FIXME: We should find a better way to handle this class of problems.
// Potentially, we should combine constant-condition vselect nodes
// pre-legalization into shuffles and not mark as many types as custom
// lowered.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
// FIXME: We don't support i16-element blends currently. We could and
// should support them by making *all* the bits in the condition be set
// rather than just the high bit and using an i8-element blend.
if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
if (VT.is128BitVector() && !Subtarget.hasSSE41())
return SDValue();
// Byte blends are only available in AVX2.
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
// There are no 512-bit blend instructions that use sign bits.
if (VT.is512BitVector())
return SDValue();
// TODO: Add other opcodes eventually lowered into BLEND.
for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
UI != UE; ++UI)
if ((UI->getOpcode() != ISD::VSELECT &&
UI->getOpcode() != X86ISD::BLENDV) ||
UI.getOperandNo() != 0)
return SDValue();
APInt DemandedMask(APInt::getSignMask(BitWidth));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
return SDValue();
// If we changed the computation somewhere in the DAG, this change will
// affect all users of Cond. Update all the nodes so that we do not use
// the generic VSELECT anymore. Otherwise, we may perform wrong
// optimizations as we messed with the actual expectation for the vector
// boolean values.
for (SDNode *U : Cond->uses()) {
if (U->getOpcode() == X86ISD::BLENDV)
continue;
SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
Cond, U->getOperand(1), U->getOperand(2));
DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
DCI.AddToWorklist(U);
}
DCI.CommitTargetLoweringOpt(TLO);
return SDValue(N, 0);
}
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
// Try simplification again because we use this function to optimize
// BLENDV nodes that are not handled by the generic combiner.
if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
return V;
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Convert vselects with constant condition into shuffles.
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
DCI.isBeforeLegalizeOps()) {
SmallVector<int, 64> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
}
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
switch (CC) {
default: break;
case ISD::SETULT:
// Converting this to a min would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETULE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGT:
// Converting this to a max would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMAX;
break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
DAG.isEqualTo(RHS, Cond.getOperand(0))) {
switch (CC) {
default: break;
case ISD::SETOGE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
(!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETULT:
// Converting this to a max would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETOLE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) &&
!DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETULE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMAX;
break;
}
}
if (Opcode)
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
// Some mask scalar intrinsics rely on checking if only one bit is set
// and implement it in C code like this:
// A[0] = (U & 1) ? A[0] : W[0];
// This creates some redundant instructions that break pattern matching.
// fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue AndNode = Cond.getOperand(0);
if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
isOneConstant(AndNode.getOperand(1))) {
// LHS and RHS swapped due to
// setcc outputting 1 when AND resulted in 0 and vice versa.
AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
}
}
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
// The same situation applies to all vectors of i8 and i16 without BWI.
// Make sure we extend these even before type legalization gets a chance to
// split wide vectors.
// Since SKX these selects have a proper lowering.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
(ExperimentalVectorWideningLegalization ||
VT.getVectorNumElements() > 4) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
// AVX512 - Extend select with zero to merge with target shuffle.
// select(mask, extract_subvector(shuffle(x)), zero) -->
// extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
// TODO - support non target shuffles as well.
if (Subtarget.hasAVX512() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1) {
auto SelectableOp = [&TLI](SDValue Op) {
return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isTargetShuffle(Op.getOperand(0).getOpcode()) &&
isNullConstant(Op.getOperand(1)) &&
TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
Op.hasOneUse() && Op.getOperand(0).hasOneUse();
};
bool SelectableLHS = SelectableOp(LHS);
bool SelectableRHS = SelectableOp(RHS);
bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
: RHS.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
VT.getSizeInBits());
RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
VT.getSizeInBits());
Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
DAG.getUNDEF(SrcCondVT), Cond,
DAG.getIntPtrConstant(0, DL));
SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
}
}
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
// Canonicalize max and min:
// (x > y) ? x : y -> (x >= y) ? x : y
// (x < y) ? x : y -> (x <= y) ? x : y
// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
// the need for an extra compare
// against zero. e.g.
// (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
// subl %esi, %edi
// testl %edi, %edi
// movl $0, %eax
// cmovgl %edi, %eax
// =>
// xorl %eax, %eax
// subl %esi, %edi
// cmovsl %eax, %edi
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
switch (CC) {
default: break;
case ISD::SETLT:
case ISD::SETGT: {
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
// Match VSELECTs into subs with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
// left side invert the predicate to simplify logic below.
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, true);
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other->getNumOperands() == 2 &&
Other->getOperand(0) == Cond.getOperand(0)) {
SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
SDValue CondRHS = Cond->getOperand(1);
// Look for a general sub with unsigned saturation first.
// x >= y ? x-y : 0 --> subus x, y
// x > y ? x-y : 0 --> subus x, y
if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
if (isa<BuildVectorSDNode>(CondRHS)) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
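// Illustrative, with C == 64: 'x > 63 ? x + (-64) : 0' matches because
// CondRHS == 63 == -(-64) - 1, and is rewritten to usubsat x, 64.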
auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return (!Op && !Cond) ||
(Op && Cond &&
Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
};
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
/*AllowUndefs*/ true)) {
OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
OpRHS);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
// FIXME: Would it be better to use computeKnownBits to determine
// whether it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
OpRHSConst->getAPIntValue().isSignMask()) {
// Note that we have to rebuild the RHS constant here to ensure we
// don't rely on particular values of undef lanes.
OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
}
}
}
}
}
// Match VSELECTs into add with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// paddus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CondLHS = Cond->getOperand(0);
SDValue CondRHS = Cond->getOperand(1);
// Check if one of the arms of the VSELECT is vector with all bits set.
// If it's on the left side invert the predicate to simplify logic below.
SDValue Other;
if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, true);
} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
// Canonicalize condition operands.
if (CC == ISD::SETUGE) {
std::swap(CondLHS, CondRHS);
CC = ISD::SETULE;
}
// We can test against either of the addition operands.
// x <= x+y ? x+y : ~0 --> addus x, y
// x+y >= x ? x+y : ~0 --> addus x, y
if (CC == ISD::SETULE && Other == CondRHS &&
(OpLHS == CondLHS || OpRHS == CondLHS))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
CondLHS == OpLHS) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > ~C ? x+C : ~0 --> addus x, C
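// Illustrative, for v16i8 and C == 10: 'x > 245 ? x + 10 : 255' matches
// because 245 == ~10 in i8, and is rewritten to uaddsat x, 10.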
auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return Cond->getAPIntValue() == ~Op->getAPIntValue();
};
if (CC == ISD::SETULE &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
}
}
}
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
return V;
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
RHS = DAG.getBitcast(MVT::i64, RHS);
SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
return DAG.getBitcast(VT, newSelect);
}
return SDValue();
}
/// Combine:
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Can't replace the cmp if it has more uses than the one we're looking at.
// FIXME: We would like to be able to handle this, but would need to make sure
// all uses were updated.
if (!Cmp.hasOneUse())
return SDValue();
// This only applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
// (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
// Using the proper condcodes (see below), overflow is checked for.
// FIXME: We can generalize both constraints:
// - XOR/OR/AND (if they were made to survive AtomicExpand)
// - LHS != 1
// if the result is compared.
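// Illustrative: with CC == COND_S the original code tested 'old value x < 0';
// the LOCKed add sets flags for x + 1, and x < 0 holds exactly when
// x + 1 <= 0, so CC is remapped to COND_LE below (COND_LE reads SF/OF/ZF,
// which also covers the overflowing x == INT_MAX case).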
SDValue CmpLHS = Cmp.getOperand(0);
SDValue CmpRHS = Cmp.getOperand(1);
if (!CmpLHS.hasOneUse())
return SDValue();
unsigned Opc = CmpLHS.getOpcode();
if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
return SDValue();
SDValue OpRHS = CmpLHS.getOperand(2);
auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
if (!OpRHSC)
return SDValue();
APInt Addend = OpRHSC->getAPIntValue();
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
if (!CmpRHSC)
return SDValue();
APInt Comparison = CmpRHSC->getAPIntValue();
// If the addend is the negation of the comparison value, then we can do
// a full comparison by emitting the atomic arithmetic as a locked sub.
if (Comparison == -Addend) {
// The CC is fine, but we need to rewrite the LHS of the comparison as an
// atomic sub.
auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
auto AtomicSub = DAG.getAtomic(
ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
/*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
/*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
AN->getMemOperand());
auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
// We can handle comparisons with zero in a number of cases by manipulating
// the CC used.
if (!Comparison.isNullValue())
return SDValue();
if (CC == X86::COND_S && Addend == 1)
CC = X86::COND_LE;
else if (CC == X86::COND_NS && Addend == 1)
CC = X86::COND_G;
else if (CC == X86::COND_G && Addend == -1)
CC = X86::COND_GE;
else if (CC == X86::COND_LE && Addend == -1)
CC = X86::COND_L;
else
return SDValue();
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Quit if not used as a boolean value.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
// Check CMP operands. One of them should be 0 or 1 and the other should be
// an SetCC or extended from it.
SDValue Op1 = Cmp.getOperand(0);
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
const ConstantSDNode* C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
if ((C = dyn_cast<ConstantSDNode>(Op1)))
SetCC = Op2;
else if ((C = dyn_cast<ConstantSDNode>(Op2)))
SetCC = Op1;
else // Quit if neither operand is a constant.
return SDValue();
if (C->getZExtValue() == 1) {
needOppositeCond = !needOppositeCond;
checkAgainstTrue = true;
} else if (C->getZExtValue() != 0)
// Quit if the constant is neither 0 nor 1.
return SDValue();
bool truncatedToBoolWithAnd = false;
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
SetCC.getOpcode() == ISD::TRUNCATE ||
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
if (isOneConstant(SetCC.getOperand(0)))
OpIdx = 1;
if (isOneConstant(SetCC.getOperand(1)))
OpIdx = 0;
if (OpIdx < 0)
break;
SetCC = SetCC.getOperand(OpIdx);
truncatedToBoolWithAnd = true;
} else
SetCC = SetCC.getOperand(0);
}
switch (SetCC.getOpcode()) {
case X86ISD::SETCC_CARRY:
// Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
// simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
// i.e. it's a comparison against true but the result of SETCC_CARRY is not
// truncated to i1 using 'and'.
if (checkAgainstTrue && !truncatedToBoolWithAnd)
break;
assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
"Invalid use of SETCC_CARRY!");
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
// Set the condition code or opposite one if necessary.
CC = X86::CondCode(SetCC.getConstantOperandVal(0));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(1);
case X86ISD::CMOV: {
// Check whether the false/true values are canonical, i.e. 0 or 1.
ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
// Quit if true value is not a constant.
if (!TVal)
return SDValue();
// Quit if false value is not a constant.
if (!FVal) {
SDValue Op = SetCC.getOperand(0);
// Skip 'zext' or 'trunc' node.
if (Op.getOpcode() == ISD::ZERO_EXTEND ||
Op.getOpcode() == ISD::TRUNCATE)
Op = Op.getOperand(0);
// A special case for rdrand/rdseed, where 0 is set if false cond is
// found.
if ((Op.getOpcode() != X86ISD::RDRAND &&
Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
return SDValue();
}
// Quit if false value is not the constant 0 or 1.
bool FValIsFalse = true;
if (FVal && FVal->getZExtValue() != 0) {
if (FVal->getZExtValue() != 1)
return SDValue();
// If FVal is 1, opposite cond is needed.
needOppositeCond = !needOppositeCond;
FValIsFalse = false;
}
// Quit if TVal is not the constant opposite of FVal.
if (FValIsFalse && TVal->getZExtValue() != 1)
return SDValue();
if (!FValIsFalse && TVal->getZExtValue() != 0)
return SDValue();
CC = X86::CondCode(SetCC.getConstantOperandVal(2));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(3);
}
}
return SDValue();
}
/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
/// (X86or (X86setcc) (X86setcc))
/// (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
X86::CondCode &CC1, SDValue &Flags,
bool &isAnd) {
if (Cond->getOpcode() == X86ISD::CMP) {
if (!isNullConstant(Cond->getOperand(1)))
return false;
Cond = Cond->getOperand(0);
}
isAnd = false;
SDValue SetCC0, SetCC1;
switch (Cond->getOpcode()) {
default: return false;
case ISD::AND:
case X86ISD::AND:
isAnd = true;
LLVM_FALLTHROUGH;
case ISD::OR:
case X86ISD::OR:
SetCC0 = Cond->getOperand(0);
SetCC1 = Cond->getOperand(1);
break;
};
// Make sure we have SETCC nodes, using the same flags value.
if (SetCC0.getOpcode() != X86ISD::SETCC ||
SetCC1.getOpcode() != X86ISD::SETCC ||
SetCC0->getOperand(1) != SetCC1->getOperand(1))
return false;
CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
Flags = SetCC0->getOperand(1);
return true;
}
// When legalizing carry, we create carries via add X, -1.
// If that comes from an actual carry, via setcc, we use the
// carry directly.
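// Illustrative: if X is a 0/1 carry value, 'add X, -1' sets CF exactly when
// X == 1 (1 + 0xFF...F wraps, 0 + 0xFF...F does not), so the user can consume
// the original carry flags directly.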
static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
if (EFLAGS.getOpcode() == X86ISD::ADD) {
if (isAllOnesConstant(EFLAGS.getOperand(1))) {
SDValue Carry = EFLAGS.getOperand(0);
while (Carry.getOpcode() == ISD::TRUNCATE ||
Carry.getOpcode() == ISD::ZERO_EXTEND ||
Carry.getOpcode() == ISD::SIGN_EXTEND ||
Carry.getOpcode() == ISD::ANY_EXTEND ||
(Carry.getOpcode() == ISD::AND &&
isOneConstant(Carry.getOperand(1))))
Carry = Carry.getOperand(0);
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
// TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
uint64_t CarryCC = Carry.getConstantOperandVal(0);
SDValue CarryOp1 = Carry.getOperand(1);
if (CarryCC == X86::COND_B)
return CarryOp1;
if (CarryCC == X86::COND_A) {
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because Cmp
// instruction cannot take an immediate as its first operand.
//
if (CarryOp1.getOpcode() == X86ISD::SUB &&
CarryOp1.getNode()->hasOneUse() &&
CarryOp1.getValueType().isInteger() &&
!isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
SDValue SubCommute =
DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
CarryOp1.getOperand(1), CarryOp1.getOperand(0));
return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
}
}
// If this is a check of the z flag of an add with 1, switch to the
// C flag.
if (CarryCC == X86::COND_E &&
CarryOp1.getOpcode() == X86ISD::ADD &&
isOneConstant(CarryOp1.getOperand(1)))
return CarryOp1;
}
}
}
return SDValue();
}
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)
if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue FalseOp = N->getOperand(0);
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
// cmov X, X, ?, ? --> X
if (TrueOp == FalseOp)
return TrueOp;
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
// larger than FalseC (the false value).
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
std::swap(TrueOp, FalseOp);
}
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
return Cond;
}
// Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
bool isFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default: break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
case 3: // result = lea base(cond, cond*2)
case 4: // result = lea base( , cond*4)
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
isFastMultiplier = true;
break;
}
}
if (isFastMultiplier) {
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
Cond = getSETCC(CC, Cond, DL ,DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, DL, Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
}
}
}
// Handle these cases:
// (select (x != c), e, c) -> (select (x != c), e, x),
// (select (x == c), c, e) -> (select (x == c), x, e)
// where c is an integer constant, and the "select" is the combination
// of CMOV and CMP.
//
// The rationale for this change is that the conditional-move from a constant
// needs two instructions, whereas a conditional-move from a register needs
// only one instruction.
//
// CAVEAT: By replacing a constant with a symbolic value, it may obscure
// some instruction-combining opportunities. This opt needs to be
// postponed as late as possible.
//
if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
// the DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
if (CC == X86::COND_NE &&
CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueOp, FalseOp);
}
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = { FalseOp, Cond.getOperand(0),
DAG.getConstant(CC, DL, MVT::i8), Cond };
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
}
// Fold and/or of setcc's to double CMOV:
// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
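// Illustrative: in the OR form, the inner CMOV yields T when cc1 holds (else
// F), and the outer CMOV overrides with T when cc2 holds, so F is produced
// only when both conditions are false, matching (cc1 | cc2) != 0 ? T : F.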
//
// This combine lets us generate:
// cmovcc1 (jcc1 if we don't have CMOV)
// cmovcc2 (same)
// instead of:
// setcc1
// setcc2
// and/or
// cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
// When we can use CMOV, or when there is no mispredict, this improves
// throughput and reduces register pressure.
//
if (CC == X86::COND_NE) {
SDValue Flags;
X86::CondCode CC0, CC1;
bool isAndSetCC;
if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
if (isAndSetCC) {
std::swap(FalseOp, TrueOp);
CC0 = X86::GetOppositeBranchCondition(CC0);
CC1 = X86::GetOppositeBranchCondition(CC1);
}
SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
return CMOV;
}
}
// Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
// (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
// Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
// (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
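// Illustrative, with C1 == 5 and C2 == 2: both forms yield 5 when X == 0 and
// cttz(X) + 2 otherwise, since the inner CMOV now selects between
// C1 - C2 == 3 and cttz(X) before the final add of C2.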
if ((CC == X86::COND_NE || CC == X86::COND_E) &&
Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
SDValue Add = TrueOp;
SDValue Const = FalseOp;
// Canonicalize the condition code for easier matching and output.
if (CC == X86::COND_E)
std::swap(Add, Const);
// We might have replaced the constant in the cmov with the LHS of the
// compare. If so change it to the RHS of the compare.
if (Const == Cond.getOperand(0))
Const = Cond.getOperand(1);
// Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
(Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
EVT VT = N->getValueType(0);
// This should constant fold.
SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
DAG.getConstant(X86::COND_NE, DL, MVT::i8),
Cond);
return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
}
}
return SDValue();
}
/// Different mul shrinking modes.
enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
EVT VT = N->getOperand(0).getValueType();
if (VT.getScalarSizeInBits() != 32)
return false;
assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
unsigned SignBits[2] = {1, 1};
bool IsPositive[2] = {false, false};
for (unsigned i = 0; i < 2; i++) {
SDValue Opd = N->getOperand(i);
SignBits[i] = DAG.ComputeNumSignBits(Opd);
IsPositive[i] = DAG.SignBitIsZero(Opd);
}
bool AllPositive = IsPositive[0] && IsPositive[1];
unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
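// Illustrative: MinSignBits >= 25 on i32 elements means the top 25 bits are
// all copies of the sign bit, so each value is determined by its low 8 bits,
// i.e. lies in [-128, 127]; the 24/17/16 thresholds below correspond to
// [0, 255], [-32768, 32767] and [0, 65535] in the same way.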
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
Mode = MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
Mode = MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
Mode = MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
Mode = MULU16;
else
return false;
return true;
}
/// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
/// efficient code. Two typical patterns are handled:
/// Pattern1:
/// %2 = sext/zext <N x i8> %1 to <N x i32>
/// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// Pattern2:
/// %2 = zext/sext <N x i16> %1 to <N x i32>
/// %4 = zext/sext <N x i16> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
/// generate pmullw+sext32 for it (MULS8 mode).
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
/// generate pmullw+zext32 for it (MULU8 mode).
/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
/// generate pmullw+pmulhw for it (MULS16 mode).
/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
/// generate pmullw+pmulhuw for it (MULU16 mode).
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Check for legality
// pmullw/pmulhw require SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
// Check for profitability
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();
if ((NumElts % 2) != 0)
return SDValue();
unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
if (ExperimentalVectorWideningLegalization ||
NumElts >= OpsVT.getVectorNumElements()) {
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
if (Mode == MULU8 || Mode == MULS8)
return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
DL, VT, MulLo);
MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
// Generate shuffle functioning as punpcklwd.
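// Illustrative, for NumElts == 8: the mask is <0,8, 1,9, 2,10, 3,11>,
// interleaving element i of MulLo with element i of MulHi so that, after the
// bitcast, each i32 lane holds the low and high halves of one product.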
SmallVector<int, 16> ShuffleMask(NumElts);
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i;
ShuffleMask[2 * i + 1] = i + NumElts;
}
SDValue ResLo =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResLo = DAG.getBitcast(ResVT, ResLo);
// Generate shuffle functioning as punpckhwd.
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i + NumElts / 2;
ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
}
SDValue ResHi =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResHi = DAG.getBitcast(ResVT, ResHi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
// When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
// to legalize the mul explicitly because implicit legalization for type
// <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
// instructions which will not exist when we explicitly legalize it by
// extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
// <4 x i16> undef).
//
// Legalize the operands of mul.
// FIXME: We may be able to handle non-concatenated vectors by insertion.
unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
if ((RegSize % ReducedSizeInBits) != 0)
return SDValue();
SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
DAG.getUNDEF(ReducedVT));
Ops[0] = NewN0;
NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
Ops[0] = NewN1;
NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
if (Mode == MULU8 || Mode == MULS8) {
// Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
// part is needed.
SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
// Convert the mul result to type VT.
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
: ISD::SIGN_EXTEND_VECTOR_INREG,
DL, ResVT, Mul);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
// Generate the lower and higher part of mul: pmulhw/pmulhuw. For
// MULU16/MULS16, both parts are needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
OpsVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result. Make sure the type of mul result is VT.
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
Res = DAG.getBitcast(ResVT, Res);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
EVT VT, const SDLoc &DL) {
auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mult, DL, VT));
Result = DAG.getNode(ISD::SHL, DL, VT, Result,
DAG.getConstant(Shift, DL, MVT::i8));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
};
auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mul1, DL, VT));
Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
DAG.getConstant(Mul2, DL, VT));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
};
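// Illustrative: case 11 below uses combineMulShlAddOrSub(5, 1, add), i.e.
// ((x * 5) << 1) + x == 11x; case 29 wraps combineMulMulAddOrSub(9, 3, add)
// in one extra add, i.e. (((x * 9) * 3) + x) + x == 29x.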
switch (MulAmt) {
default:
break;
case 11:
// mul x, 11 => add ((shl (mul x, 5), 1), x)
return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
case 21:
// mul x, 21 => add ((shl (mul x, 5), 2), x)
return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
case 41:
// mul x, 41 => add ((shl (mul x, 5), 3), x)
return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
case 22:
// mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
case 19:
// mul x, 19 => add ((shl (mul x, 9), 1), x)
return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
case 37:
// mul x, 37 => add ((shl (mul x, 9), 2), x)
return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
case 73:
// mul x, 73 => add ((shl (mul x, 9), 3), x)
return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
case 13:
// mul x, 13 => add ((shl (mul x, 3), 2), x)
return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
case 23:
// mul x, 23 => sub ((shl (mul x, 3), 3), x)
return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
case 26:
// mul x, 26 => add ((mul (mul x, 5), 5), x)
return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
case 28:
// mul x, 28 => add ((mul (mul x, 9), 3), x)
return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
case 29:
// mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
}
// Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
// by a single LEA.
// First check if this is a sum of two powers of 2 because that's easy. Then
// count the trailing zeros up to the first set bit.
// TODO: We can do this even without LEA at a cost of two shifts and an add.
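// Illustrative: MulAmt == 20 == 16 + 4 gives ShiftAmt == 4 and
// ScaleShift == 2, producing (x << 4) + (x << 2); the second term matches an
// LEA scale of 2/4/8.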
if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
unsigned ScaleShift = countTrailingZeros(MulAmt);
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ShiftAmt, DL, MVT::i8));
SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ScaleShift, DL, MVT::i8));
return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
}
}
return SDValue();
}
// If the upper 17 bits of each element are zero then we can use PMADDWD,
// which is always at least as quick as PMULLD, except on KNL.
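// Intuition (illustrative): with only the low 15 bits possibly set, each i32
// element bitcasts to an i16 pair (value, 0); PMADDWD multiplies the pairs and
// adds them, giving value0 * value1 + 0 * 0, and since both values are below
// 2^15 the signed 16-bit multiply is exact, so this equals the i32 multiply.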
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
if (Subtarget.isPMADDWDSlow())
return SDValue();
EVT VT = N->getValueType(0);
// Only support vXi32 vectors.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
// Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
// Also allow v2i32 if it will be widened.
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) ||
DAG.getTargetLoweringInfo().isTypeLegal(WVT)))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// If we are zero extending in two steps without SSE4.1, it's better to reduce
// the vmul width instead.
if (!Subtarget.hasSSE41() &&
(N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
(N1.getOpcode() == ISD::ZERO_EXTEND &&
N1.getOperand(0).getScalarValueSizeInBits() <= 8))
return SDValue();
APInt Mask17 = APInt::getHighBitsSet(32, 17);
if (!DAG.MaskedValueIsZero(N1, Mask17) ||
!DAG.MaskedValueIsZero(N0, Mask17))
return SDValue();
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
PMADDWDBuilder);
}
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
EVT VT = N->getValueType(0);
// Only support vXi64 vectors.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
VT.getVectorNumElements() < 2 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// PMULDQ returns the 64-bit result of the signed multiplication of the lower
// 32-bits. We can lower with this if the sign bits stretch that far.
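// E.g. for v2i64 operands whose elements are sign-extended i32 values (more
// than 32 sign bits), the signed 32x32->64 multiply of the low dwords already
// equals the full 64-bit product, so a single PMULDQ per 128-bit half suffices.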
if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
DAG.ComputeNumSignBits(N1) > 32) {
auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
PMULDQBuilder, /*CheckBWI*/false);
}
// If the upper bits are zero we can use a single pmuludq.
APInt Mask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
PMULUDQBuilder, /*CheckBWI*/false);
}
return SDValue();
}
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
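/// For example, mul x, 45 decomposes as (mul (mul x, 9), 5), i.e. two LEAs,
/// and mul x, 40 becomes a multiply by 5 combined with a shift by 3,
/// i.e. LEA + SHL.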
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
return V;
if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
return V;
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
if (isPowerOf2_64(C->getZExtValue()))
return SDValue();
int64_t SignMulAmt = C->getSExtValue();
assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
SDLoc DL(N);
if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(AbsMulAmt, DL, VT));
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
NewMul);
return NewMul;
}
uint64_t MulAmt1 = 0;
uint64_t MulAmt2 = 0;
if ((AbsMulAmt % 9) == 0) {
MulAmt1 = 9;
MulAmt2 = AbsMulAmt / 9;
} else if ((AbsMulAmt % 5) == 0) {
MulAmt1 = 5;
MulAmt2 = AbsMulAmt / 5;
} else if ((AbsMulAmt % 3) == 0) {
MulAmt1 = 3;
MulAmt2 = AbsMulAmt / 3;
}
SDValue NewMul;
// For negative multiply amounts, only allow MulAmt2 to be a power of 2.
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) ||
(SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
if (isPowerOf2_64(MulAmt2) &&
!(SignMulAmt >= 0 && N->hasOneUse() &&
N->use_begin()->getOpcode() == ISD::ADD))
// If the second multiplier is a power of 2, issue it first. We want the
// multiply by 3, 5, or 9 to be folded into the addressing mode unless the
// lone use is an add. Only do this for positive multiply amounts since the
// negate would prevent it from being used as an address mode anyway.
std::swap(MulAmt1, MulAmt2);
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(MulAmt1, DL, VT));
if (isPowerOf2_64(MulAmt2))
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
// Negate the result.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
NewMul);
} else if (!Subtarget.slowLEA())
NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
if (!NewMul) {
assert(C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
"Both cases that could cause potential overflows should have "
"already been handled.");
if (isPowerOf2_64(AbsMulAmt - 1)) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
NewMul = DAG.getNode(
ISD::ADD, DL, VT, N->getOperand(0),
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
MVT::i8)));
// To negate, subtract the number from zero
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), NewMul);
} else if (isPowerOf2_64(AbsMulAmt + 1)) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 1),
DL, MVT::i8));
// To negate, reverse the operands of the subtract.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
else
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
// (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
// (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
}
}
return NewMul;
}
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
EVT VT = N0.getValueType();
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
if (VT.isInteger() && !VT.isVector() &&
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = N0.getConstantOperandAPInt(1);
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
// preserving.
// The transform is not safe if the result of C1 << C2 exceeds the bitwidth
// of the underlying setcc_c operation if the setcc_c was zero extended.
// Consider the following example:
// zext(setcc_c) -> i32 0x0000FFFF
// c1 -> i32 0x0000FFFF
// c2 -> i32 0x00000001
// (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
// (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
N00.getOpcode() == ISD::ANY_EXTEND) &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
}
if (MaskOK && Mask != 0) {
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
}
}
// Hardware support for vector shifts is sparse, which makes us scalarize the
// vector operations in many cases. Also, on Sandy Bridge ADD is faster than
// shl.
// (shl V, 1) -> add V,V
if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
assert(N0.getValueType().isVector() && "Invalid vector shift type");
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
if (N1SplatC->getAPIntValue() == 1)
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
return SDValue();
}
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Size = VT.getSizeInBits();
// fold (ashr (shl a, [56,48,32,24,16]), SarConst)
// into (shl (sext a), [56,48,32,24,16] - SarConst) or
// into (sra (sext a), SarConst - [56,48,32,24,16])
// depending on the sign of (SarConst - [56,48,32,24,16]).
// sexts on X86 are MOVs. The MOVs have the same code size
// as the SHIFTs above (only a SHIFT by 1 has smaller code size).
// However, the MOVs have two advantages over a SHIFT:
// 1. MOVs can write to a register that differs from the source.
// 2. MOVs accept memory operands.
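// For a concrete i32 case: (ashr (shl x, 24), 25) becomes
// (sra (sext_inreg x, i8), 1), and (ashr (shl x, 24), 22) becomes
// (shl (sext_inreg x, i8), 2).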
if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
N0.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
EVT CVT = N1.getValueType();
if (SarConst.isNegative())
return SDValue();
for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
unsigned ShiftSize = SVT.getSizeInBits();
// Skip types without a corresponding sext/zext and ShlConst values that are
// not one of [56,48,32,24,16].
if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
continue;
SDLoc DL(N);
SDValue NN =
DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
return NN;
else if (SarConst.isNegative())
return DAG.getNode(ISD::SHL, DL, VT, NN,
DAG.getConstant(-SarConst, DL, CVT));
else
return DAG.getNode(ISD::SRA, DL, VT, NN,
DAG.getConstant(SarConst, DL, CVT));
}
return SDValue();
}
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
// Only do this on the last DAG combine as it can interfere with other
// combines.
if (!DCI.isAfterLegalizeDAG())
return SDValue();
// Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
// TODO: This is a generic DAG combine that became an x86-only combine to
// avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
// and-not ('andn').
if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();
auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!ShiftC || !AndC)
return SDValue();
// If we can shrink the constant mask below 8-bits or 32-bits, then this
// transform should reduce code size. It may also enable secondary transforms
// from improved known-bits analysis or instruction selection.
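// For example, srl (and X, 0x7F00), 8 becomes and (srl X, 8), 0x7F: the old
// mask needs a wide immediate while the shifted mask fits in an imm8.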
APInt MaskVal = AndC->getAPIntValue();
// If this can be matched by a zero extend, don't optimize.
if (MaskVal.isMask()) {
unsigned TO = MaskVal.countTrailingOnes();
if (TO >= 8 && isPowerOf2_32(TO))
return SDValue();
}
APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
unsigned OldMaskSize = MaskVal.getMinSignedBits();
unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
(OldMaskSize > 32 && NewMaskSize <= 32)) {
// srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
SDLoc DL(N);
SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
}
return SDValue();
}
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned DstBitsPerElt = VT.getScalarSizeInBits();
unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
"Unexpected PACKSS/PACKUS input type");
bool IsSigned = (X86ISD::PACKSS == Opcode);
// Constant Folding.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
(N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumDstElts = VT.getVectorNumElements();
unsigned NumSrcElts = NumDstElts / 2;
unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
APInt Undefs(NumDstElts, 0);
SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
if (UndefElts[SrcIdx]) {
Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
continue;
}
APInt &Val = EltBits[SrcIdx];
if (IsSigned) {
// PACKSS: Truncate signed value with signed saturation.
// Source values less than dst minint are saturated to minint.
// Source values greater than dst maxint are saturated to maxint.
if (Val.isSignedIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
Val = APInt::getSignedMinValue(DstBitsPerElt);
else
Val = APInt::getSignedMaxValue(DstBitsPerElt);
} else {
// PACKUS: Truncate signed value with unsigned saturation.
// Source values less than zero are saturated to zero.
// Source values greater than dst maxuint are saturated to maxuint.
if (Val.isIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
Val = APInt::getNullValue(DstBitsPerElt);
else
Val = APInt::getAllOnesValue(DstBitsPerElt);
}
Bits[Lane * NumDstEltsPerLane + Elt] = Val;
}
}
return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
}
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
if (Subtarget.hasAVX512() &&
N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
N0.getOperand(0).getValueType() == MVT::v8i32) {
if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
(!IsSigned &&
DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
if (Subtarget.hasVLX())
return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
// Widen input to v16i32 so we can truncate that.
SDLoc dl(N);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
}
}
// Attempt to combine as shuffle.
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
}
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
X86ISD::VSRL == N->getOpcode()) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Shift zero -> zero.
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Detect constant shift amounts.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
EltBits[0].getZExtValue(), DAG);
}
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
X86ISD::VSRLI == Opcode) &&
"Unexpected shift opcode");
bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
"Unexpected value type");
assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue();
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
else
ShiftVal = NumBitsPerElt - 1;
}
// Shift N0 by zero -> N0.
if (!ShiftVal)
return N0;
// Shift zero -> zero.
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
// clamped to (NumBitsPerElt - 1).
if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
unsigned NewShiftVal = ShiftVal + ShiftVal2;
if (NewShiftVal >= NumBitsPerElt)
NewShiftVal = NumBitsPerElt - 1;
return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8));
}
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
// Constant Folding.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (N->isOnlyUserOf(N0.getNode()) &&
getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
assert(EltBits.size() == VT.getVectorNumElements() &&
"Unexpected shift value type");
for (APInt &Elt : EltBits) {
if (X86ISD::VSHLI == Opcode)
Elt <<= ShiftVal;
else if (X86ISD::VSRAI == Opcode)
Elt.ashrInPlace(ShiftVal);
else
Elt.lshrInPlace(ShiftVal);
}
return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
APInt::getAllOnesValue(NumBitsPerElt), DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
(N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
"Unexpected vector insertion");
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
APInt::getAllOnesValue(NumBitsPerElt), DCI))
return SDValue(N, 0);
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
}
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
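/// E.g. a scalar floating-point 'a == b' is lowered on x86 as
/// setcc(COND_E, cmp) AND setcc(COND_NP, cmp) (ZF set and PF clear, so NaNs
/// compare unequal); CMPEQSS computes that ordered-equal predicate directly as
/// an all-ones/zero mask, which we then AND with 1.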
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned opcode;
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0.getOperand(1);
SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
SDValue CMP01 = CMP0->getOperand(1);
EVT VT = CMP00.getValueType();
if (VT == MVT::f32 || VT == MVT::f64) {
bool ExpectingFlags = false;
// Check for any users that want flags:
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
!ExpectingFlags && UI != UE; ++UI)
switch (UI->getOpcode()) {
default:
case ISD::BR_CC:
case ISD::BRCOND:
case ISD::SELECT:
ExpectingFlags = true;
break;
case ISD::CopyToReg:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
}
if (!ExpectingFlags) {
enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
cc0 = cc1;
cc1 = tmp;
}
if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
DAG.getConstant(0, DL, MVT::v16i1),
FSetCC, DAG.getIntPtrConstant(0, DL));
return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
N->getSimpleValueType(0));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
DAG.getConstant(x86cc, DL,
MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
if (is64BitFP && !Subtarget.is64Bit()) {
// On a 32-bit target, we cannot bitcast the 64-bit float to a
// 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
// bits, but can do this little dance to extract the lowest 32 bits
// and work with those going forward.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
OnesOrZeroesF);
SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
Vector32, DAG.getIntPtrConstant(0, DL));
IntVT = MVT::i32;
}
SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
DAG.getConstant(1, DL, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
ANDed);
return OneBitOfTruth;
}
}
}
}
return SDValue();
}
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
V = peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
return V.getOperand(0);
if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
(isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
Not, V.getOperand(1));
}
}
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps)) {
for (SDValue &CatOp : CatOps) {
SDValue NotCat = IsNOT(CatOp, DAG);
if (!NotCat) return SDValue();
CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
}
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
}
return SDValue();
}
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
MVT VT = N->getSimpleValueType(0);
if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
SDValue X, Y;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue Not = IsNOT(N0, DAG)) {
X = Not;
Y = N1;
} else if (SDValue Not = IsNOT(N1, DAG)) {
X = Not;
Y = N0;
} else
return SDValue();
X = DAG.getBitcast(VT, X);
Y = DAG.getBitcast(VT, Y);
return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow.getValueType();
if (Narrow->getOpcode() != ISD::XOR &&
Narrow->getOpcode() != ISD::AND &&
Narrow->getOpcode() != ISD::OR)
return SDValue();
SDValue N0 = Narrow->getOperand(0);
SDValue N1 = Narrow->getOperand(1);
SDLoc DL(Narrow);
// The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
return SDValue();
// The type of the truncated inputs.
if (N0.getOperand(0).getValueType() != VT)
return SDValue();
// The right side has to be a 'trunc' or a constant vector.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getValueType() == VT;
if (!RHSTrunc &&
!ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
return SDValue();
// Set N0 and N1 to hold the inputs to the new wide operation.
N0 = N0.getOperand(0);
if (RHSTrunc)
N1 = N1.getOperand(0);
else
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
// Generate the wide operation.
SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
unsigned Opcode = N->getOpcode();
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND:
return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
Op, DAG.getValueType(NarrowVT));
}
}
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N10 = N1.getOperand(0);
EVT N00Type = N00.getValueType();
EVT N10Type = N10.getValueType();
// Ensure that both types are the same and are legal scalar fp types.
if (N00Type != N10Type ||
!((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
(Subtarget.hasSSE2() && N00Type == MVT::f64)))
return SDValue();
unsigned FPOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected input node for FP logic conversion");
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
}
SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
}
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
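/// For example, with a v4i32 operand whose lanes are known to be all-ones or
/// all-zeros (e.g. a compare result), (and X, splat(1)) can instead be emitted
/// as (X86ISD::VSRLI X, 31), avoiding the constant-pool load of the mask.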
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
EVT VT0 = Op0.getValueType();
EVT VT1 = Op1.getValueType();
if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
!SplatVal.isMask())
return SDValue();
// Don't prevent creation of ANDN.
if (isBitwiseNot(Op0))
return SDValue();
if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
return SDValue();
unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
if (Ld->isIndexed())
return SDValue();
SDValue Base = Ld->getBasePtr();
if (Base.getOpcode() != ISD::ADD)
return SDValue();
SDValue ShiftedIndex = Base.getOperand(0);
if (ShiftedIndex.getOpcode() != ISD::SHL)
return SDValue();
return ShiftedIndex.getOperand(0);
}
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
switch (VT.getSizeInBits()) {
default: return false;
case 64: return Subtarget.is64Bit() ? true : false;
case 32: return true;
}
}
return false;
}
// This function recognizes cases where the X86 BZHI instruction can replace an
// 'and-load' sequence.
// When an integer value is loaded from an array of constants defined as
// follows:
//
// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// and a bitwise AND is then applied to the loaded value and another input,
// this is equivalent to performing BZHI (zero high bits) on the input, using
// the same index as the load.
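// For a 32-bit element type, arr[idx] == (1 << idx) - 1, so
// (and (load arr[idx]), inp) == (and inp, (srl 0xFFFFFFFF, (sub 32, idx))),
// e.g. idx == 5 loads 0x1F and keeps only the low 5 bits of inp -- exactly
// what BZHI computes in a single instruction.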
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Node->getSimpleValueType(0);
SDLoc dl(Node);
// Check if subtarget has BZHI instruction for the node's type
if (!hasBZHI(Subtarget, VT))
return SDValue();
// Try matching the pattern for both operands.
for (unsigned i = 0; i < 2; i++) {
SDValue N = Node->getOperand(i);
LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
// Bail out if the operand is not a load instruction.
if (!Ld)
return SDValue();
const Value *MemOp = Ld->getMemOperand()->getValue();
if (!MemOp)
return SDValue();
if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
Constant *Init = GV->getInitializer();
Type *Ty = Init->getType();
if (!isa<ConstantDataArray>(Init) ||
!Ty->getArrayElementType()->isIntegerTy() ||
Ty->getArrayElementType()->getScalarSizeInBits() !=
VT.getSizeInBits() ||
Ty->getArrayNumElements() >
Ty->getArrayElementType()->getScalarSizeInBits())
continue;
// Check if the array's constant elements are suitable to our case.
uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
bool ConstantsMatch = true;
for (uint64_t j = 0; j < ArrayElementCount; j++) {
ConstantInt *Elem =
dyn_cast<ConstantInt>(Init->getAggregateElement(j));
if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
ConstantsMatch = false;
break;
}
}
if (!ConstantsMatch)
continue;
// Do the transformation (For 32-bit type):
// -> (and (load arr[idx]), inp)
// <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
// that will be replaced with one bzhi instruction.
SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
// Get the Node which indexes into the array.
SDValue Index = getIndexFromUnindexedLoad(Ld);
if (!Index)
return SDValue();
Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
}
}
}
}
return SDValue();
}
// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
// Turn it into series of XORs and a setnp.
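// The idea: parity(x) only depends on the xor of all of x's bits, so we can
// fold the halves together with xors until only the low byte matters, then
// read x86's parity flag (PF) from an 8-bit xor; SETNP then yields 1 for an
// odd population count, matching __builtin_parity.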
static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// We only support 64-bit and 32-bit. 64-bit requires special handling
// unless the 64-bit popcnt instruction is legal.
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// LHS needs to be a single use CTPOP.
if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
return SDValue();
// RHS needs to be 1.
if (!isOneConstant(N1))
return SDValue();
SDLoc DL(N);
SDValue X = N0.getOperand(0);
// If this is 64-bit, it's always best to xor the two 32-bit pieces together
// even if we have popcnt.
if (VT == MVT::i64) {
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(32, DL, MVT::i8)));
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
// Generate a 32-bit parity idiom. This will bring us back here if we need
// to expand it too.
SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
DAG.getConstant(1, DL, MVT::i32));
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
}
assert(VT == MVT::i32 && "Unexpected VT!");
// Xor the high and low 16-bits together using a 32-bit operation.
SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(16, DL, MVT::i8));
X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
// Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
// This should allow an h-reg to be used to save a shift.
// FIXME: We only get an h-reg in 32-bit mode.
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(8, DL, MVT::i8)));
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
// Copy the inverse of the parity flag into a register with setcc.
SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
// Zero extend to original type.
return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
}
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// If this is SSE1-only, convert to FAND to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(
MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
}
// Use a 32-bit and+zext if upper bits known zero.
if (VT == MVT::i64 && Subtarget.is64Bit() &&
!isa<ConstantSDNode>(N->getOperand(1))) {
APInt HiMask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
SDLoc dl(N);
SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
}
}
// This must be done before legalization has expanded the ctpop.
if (SDValue V = combineParity(N, DAG, Subtarget))
return V;
// Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
SrcOps.size() == 1) {
SDLoc dl(N);
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
if (Mask) {
APInt AllBits = APInt::getAllOnesValue(NumElts);
return DAG.getSetCC(dl, MVT::i1, Mask,
DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
}
}
}
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
return R;
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
// Attempt to combine a scalar bitmask AND with an extracted shuffle.
if ((VT.getScalarSizeInBits() % 8) == 0 &&
N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
SDValue BitMask = N->getOperand(1);
SDValue SrcVec = N->getOperand(0).getOperand(0);
EVT SrcVecVT = SrcVec.getValueType();
// Check that the constant bitmask masks whole bytes.
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (VT == SrcVecVT.getScalarType() &&
N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
llvm::all_of(EltBits, [](APInt M) {
return M.isNullValue() || M.isAllOnesValue();
})) {
unsigned NumElts = SrcVecVT.getVectorNumElements();
unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
// Create a root shuffle mask from the byte mask and the extracted index.
SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
for (unsigned i = 0; i != Scale; ++i) {
if (UndefElts[i])
continue;
int VecIdx = Scale * Idx + i;
ShuffleMask[VecIdx] =
EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
}
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
}
}
return SDValue();
}
// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
EVT VT = N->getValueType(0);
if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
SDValue N0 = peekThroughBitcasts(N->getOperand(0));
SDValue N1 = peekThroughBitcasts(N->getOperand(1));
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
return SDValue();
// On XOP we'll lower to PCMOV so accept one use, otherwise only
// do this if either mask has multiple uses already.
if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() ||
!N1.getOperand(1).hasOneUse()))
return SDValue();
// Attempt to extract constant byte masks.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
false, false))
return SDValue();
if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
false, false))
return SDValue();
for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
// TODO - add UNDEF elts support.
if (UndefElts0[i] || UndefElts1[i])
return SDValue();
if (EltBits0[i] != ~EltBits1[i])
return SDValue();
}
SDLoc DL(N);
SDValue X = N->getOperand(0);
SDValue Y =
DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
DAG.getBitcast(VT, N1.getOperand(0)));
return DAG.getNode(ISD::OR, DL, VT, X, Y);
}
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
if (N->getOpcode() != ISD::OR)
return false;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
// Attempt to match OR(AND(M,Y),ANDNP(M,X)).
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return false;
Mask = N1.getOperand(0);
X = N1.getOperand(1);
// Check to see if the mask appeared in both the AND and ANDNP.
if (N0.getOperand(0) == Mask)
Y = N0.getOperand(1);
else if (N0.getOperand(1) == Mask)
Y = N0.getOperand(0);
else
return false;
// TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
// ANDNP combine allows other combines to happen that prevent matching.
return true;
}
// Try to match:
// (or (and (M, (sub 0, X)), (pandn M, X)))
// which is a special case of vselect:
// (vselect M, (sub 0, X), X)
// Per:
// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
// We know that, if fNegate is 0 or 1:
// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
//
// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
// ( M ? -X : X) == ((X ^ M ) + (M & 1))
// This lets us transform our vselect to:
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
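// Sanity check with X = 7: for M = all-ones, (xor 7, M) = -8 and
// (sub -8, -1) = -7, i.e. the negated value; for M = 0, (xor 7, 0) = 7 and
// (sub 7, 0) = 7, i.e. X unchanged.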
static SDValue combineLogicBlendIntoConditionalNegate(
EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
EVT MaskVT = Mask.getValueType();
assert(MaskVT.isInteger() &&
DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
"Mask must be zero/all-bits");
if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
return SDValue();
if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
return SDValue();
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
};
SDValue V;
if (IsNegV(Y.getNode(), X))
V = X;
else if (IsNegV(X.getNode(), Y))
V = Y;
else
return SDValue();
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
// If the negate was on the false side of the select, then
// the operands of the SUB need to be swapped. PR 27251.
// This is because the pattern being matched above is
// (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
// but if the pattern matched was
// (vselect M, X, (sub (0, X))), that is really negation of the pattern
// above, -(vselect M, (sub 0, X), X), and therefore the replacement
// pattern also needs to be a negation of the replacement pattern above.
// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
// sub accomplishes the negation of the replacement pattern.
if (V == Y)
std::swap(SubOp1, SubOp2);
SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
return DAG.getBitcast(VT, Res);
}
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
// (vselect m, x, y)
// As a special case, try to fold:
// (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
// (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
EVT VT = N->getValueType(0);
if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
SDValue X, Y, Mask;
if (!matchLogicBlend(N, X, Y, Mask))
return SDValue();
// Validate that X, Y, and Mask are bitcasts, and see through them.
Mask = peekThroughBitcasts(Mask);
X = peekThroughBitcasts(X);
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
unsigned EltBits = MaskVT.getScalarSizeInBits();
// TODO: Attempt to handle floating point cases as well?
if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
// Attempt to combine to conditional negate: (sub (xor X, M), M)
if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
DAG, Subtarget))
return Res;
// PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
return SDValue();
MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
// seteq(cmp x, 0)
// into:
// srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
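// E.g. for a 32-bit x with LZCNT available, ctlz(x) == 32 only when x == 0,
// so shifting the count right by log2(32) == 5 yields 1 for x == 0 and 0
// otherwise.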
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
SelectionDAG &DAG) {
SDValue Cmp = Op.getOperand(1);
EVT VT = Cmp.getOperand(0).getValueType();
unsigned Log2b = Log2_32(VT.getSizeInBits());
SDLoc dl(Op);
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
// The result of the shift is true or false, and on X86, the 32-bit
// encoding of shr and lzcnt is more desirable.
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
DAG.getConstant(Log2b, dl, MVT::i8));
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
// Try to transform:
// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
// Will also attempt to match more generic cases, eg:
// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
return SDValue();
auto isORCandidate = [](SDValue N) {
return (N->getOpcode() == ISD::OR && N->hasOneUse());
};
// Check that the zero extend extends to 32 bits or more. The code generated by
// srl(ctlz) for 16-bit or smaller variants of the pattern would require extra
// instructions to clear the upper bits.
if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
!isORCandidate(N->getOperand(0)))
return SDValue();
// Check the node matches: setcc(eq, cmp 0)
auto isSetCCCandidate = [](SDValue N) {
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
SDNode *OR = N->getOperand(0).getNode();
SDValue LHS = OR->getOperand(0);
SDValue RHS = OR->getOperand(1);
// Save nodes matching or(or, setcc(eq, cmp 0)).
SmallVector<SDNode *, 2> ORNodes;
while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
ORNodes.push_back(OR);
OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
}
// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
!isORCandidate(SDValue(OR, 0)))
return SDValue();
// We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
// to
// or(srl(ctlz),srl(ctlz)).
// The dag combiner can then fold it into:
// srl(or(ctlz, ctlz)).
EVT VT = OR->getValueType(0);
SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
SDValue Ret, NewRHS;
if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
if (!Ret)
return SDValue();
// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
while (ORNodes.size() > 0) {
OR = ORNodes.pop_back_val();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
}
if (Ret)
Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
return Ret;
}
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// If this is SSE1-only, convert to FOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(MVT::v4i32,
DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N0),
DAG.getBitcast(MVT::v4f32, N1)));
}
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
return R;
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
// series of shifts/or that would otherwise be generated.
// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
// have higher latencies and we are not optimizing for size.
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
return SDValue();
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SDValue ShAmt0 = N0.getOperand(1);
if (ShAmt0.getValueType() != MVT::i8)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
if (ShAmt1.getValueType() != MVT::i8)
return SDValue();
// Peek through any modulo shift masks.
SDValue ShMsk0;
if (ShAmt0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk0 = ShAmt0;
ShAmt0 = ShAmt0.getOperand(0);
}
SDValue ShMsk1;
if (ShAmt1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1;
ShAmt1 = ShAmt1.getOperand(0);
}
if (ShAmt0.getOpcode() == ISD::TRUNCATE)
ShAmt0 = ShAmt0.getOperand(0);
if (ShAmt1.getOpcode() == ISD::TRUNCATE)
ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
unsigned Opc = ISD::FSHL;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
Opc = ISD::FSHR;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
std::swap(ShMsk0, ShMsk1);
}
auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
SDValue Amt) {
if (Opc == ISD::FSHR)
std::swap(Op0, Op1);
return DAG.getNode(Opc, DL, VT, Op0, Op1,
DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
};
// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
// OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
// OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
if (ShAmt1.getOpcode() == ISD::SUB) {
SDValue Sum = ShAmt1.getOperand(0);
if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
if (ShAmt1Op1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1Op1;
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
}
if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
if ((SumC->getAPIntValue() == Bits ||
(SumC->getAPIntValue() == 0 && ShMsk1)) &&
ShAmt1Op1 == ShAmt0)
return GetFunnelShift(Op0, Op1, ShAmt0);
}
} else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
return GetFunnelShift(Op0, Op1, ShAmt0);
} else if (ShAmt1.getOpcode() == ISD::XOR) {
SDValue Mask = ShAmt1.getOperand(1);
if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
ShAmt1Op0 = ShAmt1Op0.getOperand(0);
if (MaskC->getSExtValue() == (Bits - 1) &&
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
Op1.getConstantOperandAPInt(1) == 1) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
Op1.getOperand(0) == Op1.getOperand(1)) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
}
}
}
return SDValue();
}
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
/// SETGT(X, -1)
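/// For a 32-bit X, (srl X, 31) is 1 exactly when X is negative, so xoring the
/// truncated bit with 1 tests X >= 0, which is the same as SETGT(X, -1).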
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
// This is only worth doing if the output type is i8 or i1.
EVT ResultType = N->getValueType(0);
if (ResultType != MVT::i8 && ResultType != MVT::i1)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// We should be performing an xor against a truncated shift.
if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
return SDValue();
// Make sure we are performing an xor against one.
if (!isOneConstant(N1))
return SDValue();
// SetCC on x86 zero extends so only act on this if it's a logical shift.
SDValue Shift = N0.getOperand(0);
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
return SDValue();
// Make sure we are truncating from one of i16, i32 or i64.
EVT ShiftTy = Shift.getValueType();
if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
return SDValue();
// Make sure the shift amount extracts the sign bit.
if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1.
// N.B. Using SETGE against 0 works but we want a canonical-looking
// comparison; using SETGT matches up with what TranslateX86CC expects.
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
EVT ShiftOpTy = ShiftOp.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), ResultType);
SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
if (SetCCResultType != ResultType)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
}
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isSimple())
return SDValue();
switch (VT.getSimpleVT().SimpleTy) {
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
}
// There must be an arithmetic shift right (sra) before the xor, and the xor
// must be a 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftAmt =
isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
if (!ShiftAmt ||
ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1. We don't use the more obvious
// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
/// Check if truncation with saturation from type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX512())
return false;
// FIXME: Scalar type may be supported if we move it to vector register.
if (!SrcVT.isVector())
return false;
EVT SrcElVT = SrcVT.getScalarType();
EVT DstElVT = DstVT.getScalarType();
if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
return false;
if (SrcVT.is512BitVector() || Subtarget.hasVLX())
return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
return false;
}
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value x to be truncated or SDValue() if the pattern was
/// not matched.
///
/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
/// where C1 >= 0 and C2 is unsigned max of destination type.
///
/// (truncate (smax (smin (x, C2), C1)) to dest_type)
/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
///
/// These two patterns are equivalent to:
/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
/// So return the smax(x, C1) value to be truncated or SDValue() if the
/// pattern was not matched.
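/// For example (illustrative): when truncating to vXi8 the matched unsigned
/// limit is 255 (an 8-bit mask, checked via APInt::isMask); for vXi16
/// destinations it is 65535.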
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const SDLoc &DL) {
EVT InVT = In.getValueType();
// Saturation with truncation. We truncate from InVT to VT.
assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
"Unexpected types for truncate operation");
// Match min/max and return limit value as a parameter.
auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
if (V.getOpcode() == Opcode &&
ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
return V.getOperand(0);
return SDValue();
};
APInt C1, C2;
if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
// C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
// the element size of the destination type.
if (C2.isMask(VT.getScalarSizeInBits()))
return UMin;
if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
if (MatchMinMax(SMin, ISD::SMAX, C1))
if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
return SMin;
if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
C2.uge(C1)) {
return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
}
return SDValue();
}
/// Detect patterns of truncation with signed saturation:
/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
/// signed_max_of_dest_type)) to dest_type)
/// or:
/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
/// signed_min_of_dest_type)) to dest_type).
/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
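/// For example (illustrative): for an i16 -> i8 signed-saturating truncate the
/// clamp limits matched below are 127 and -128 (extended to the source width);
/// with MatchPackUS they are 255 and 0 instead.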
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
unsigned NumDstBits = VT.getScalarSizeInBits();
unsigned NumSrcBits = In.getScalarValueSizeInBits();
assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
auto MatchMinMax = [](SDValue V, unsigned Opcode,
const APInt &Limit) -> SDValue {
APInt C;
if (V.getOpcode() == Opcode &&
ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
return V.getOperand(0);
return SDValue();
};
APInt SignedMax, SignedMin;
if (MatchPackUS) {
SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
SignedMin = APInt(NumSrcBits, 0);
} else {
SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
}
if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
return SMax;
if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
return SMin;
return SDValue();
}
/// Detect a pattern of truncation with signed saturation.
/// The types should allow use of the VPMOVSS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
const X86Subtarget &Subtarget,
const TargetLowering &TLI) {
if (!TLI.isTypeLegal(In.getValueType()))
return SDValue();
if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return SDValue();
return detectSSatPattern(In, VT);
}
/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow use of the VPMOVUS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const SDLoc &DL,
const X86Subtarget &Subtarget,
const TargetLowering &TLI) {
if (!TLI.isTypeLegal(In.getValueType()))
return SDValue();
if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return SDValue();
return detectUSatPattern(In, VT, DAG, DL);
}
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT SVT = VT.getScalarType();
EVT InVT = In.getValueType();
EVT InSVT = InVT.getScalarType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
if (auto SSatVal = detectSSatPattern(In, VT))
return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
}
if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
!Subtarget.hasAVX512() &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
if (SVT == MVT::i8 && InSVT == MVT::i32) {
EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements());
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
DAG, Subtarget);
if (Mid)
return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
Subtarget);
} else if (SVT == MVT::i8 || Subtarget.hasSSE41())
return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
Subtarget);
}
if (auto SSatVal = detectSSatPattern(In, VT))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
Subtarget);
}
return SDValue();
}
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
/// X86ISD::AVG instruction.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
NumElems >= 2 && isPowerOf2_32(NumElems)))
return SDValue();
// InScalarVT is the intermediate type in the AVG pattern and it should be
// wider than the original input type (i8/i16).
EVT InScalarVT = InVT.getVectorElementType();
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Detect the following pattern:
//
// %1 = zext <N x i8> %a to <N x i32>
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
// %6 = trunc <N x i32> %5 to <N x i8>
//
// In AVX512, the last instruction can also be a trunc store.
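// Illustrative values: with a = 200 and b = 100 (unsigned i8), the sequence
// computes (200 + 100 + 1) >> 1 = 150, the same rounded-up average that the
// PAVGB instruction produces.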
if (In.getOpcode() != ISD::SRL)
return SDValue();
// A lambda checking that the given SDValue is a constant vector and that each
// element is in the range [Min, Max].
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || !BV->isConstant())
return false;
for (SDValue Op : V->ops()) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
const APInt &Val = C->getAPIntValue();
if (Val.ult(Min) || Val.ugt(Max))
return false;
}
return true;
};
// Check if each element of the vector is right-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
return SDValue();
// Detect a pattern of a + b + 1 where the order doesn't matter.
SDValue Operands[3];
Operands[0] = LHS.getOperand(0);
Operands[1] = LHS.getOperand(1);
auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
};
// Take care of the case when one of the operands is a constant vector whose
// elements are in the range [1, 256] (or [1, 65536] for i16 elements).
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
Operands[0].getOperand(0).getValueType() == VT) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return SplitOpsAndApply(DAG, Subtarget, DL, VT,
{ Operands[0].getOperand(0), Operands[1] },
AVGBuilder);
}
// Matches 'add like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
// Match the or case only if it's 'add-like' - i.e. it can be replaced by an add.
auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
if (ISD::ADD == V.getOpcode()) {
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
}
if (ISD::ZERO_EXTEND != V.getOpcode())
return false;
V = V.getOperand(0);
if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
!DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
return false;
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
};
SDValue Op0, Op1;
if (FindAddLike(Operands[0], Op0, Op1))
std::swap(Operands[0], Operands[1]);
else if (!FindAddLike(Operands[1], Op0, Op1))
return SDValue();
Operands[2] = Op0;
Operands[1] = Op1;
// Now we have three operands of two additions. Check that one of them is a
// constant vector with ones, and the other two can be promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
continue;
std::swap(Operands[i], Operands[2]);
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
if (Operands[j].getValueType() != VT) {
if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
Operands[j] = Operands[j].getOperand(0);
}
// The pattern is detected, emit X86ISD::AVG instruction(s).
return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
AVGBuilder);
}
return SDValue();
}
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
// into two 16-byte operations. Also split non-temporal aligned loads on
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
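// For example, a 32-byte v8f32 load can become two 16-byte loads at offsets 0
// and 16 whose results are concatenated back together and whose chains are
// joined by a TokenFactor node.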
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
unsigned HalfAlign = 16;
SDValue Ptr1 = Ld->getBasePtr();
SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
Alignment, Ld->getMemOperand()->getFlags());
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
Ld->getPointerInfo().getWithOffset(HalfAlign),
MinAlign(Alignment, HalfAlign),
Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1), Load2.getValue(1));
SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
return DCI.CombineTo(N, NewVec, TF, true);
}
// Bool vector load - attempt to cast to an integer, as we have good
// (vXiY *ext(vXi1 bitcast(iX))) handling.
if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
unsigned NumElts = RegVT.getVectorNumElements();
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
if (TLI.isTypeLegal(IntVT)) {
SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Alignment,
Ld->getMemOperand()->getFlags());
SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
}
}
return SDValue();
}
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
// This needs to be a build vector of booleans.
// TODO: Checking for the i1 type matches the IR definition for the mask,
// but the mask check could be loosened to i8 or other types. That might
// also require checking more than 'allOnesValue'; eg, the x86 HW
// instructions only require that the MSB is set for each mask element.
// The ISD::MSTORE comments/definition do not specify how the mask operand
// is formatted.
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
return -1;
int TrueIndex = -1;
unsigned NumElts = BV->getValueType(0).getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
const SDValue &Op = BV->getOperand(i);
if (Op.isUndef())
continue;
auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
if (!ConstNode)
return -1;
if (ConstNode->getAPIntValue().isAllOnesValue()) {
// If we already found a one, this is too many.
if (TrueIndex >= 0)
return -1;
TrueIndex = i;
}
}
return TrueIndex;
}
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
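/// For example (illustrative): for a v4f32 masked operation with mask
/// <0,0,1,0>, the returned address is the base pointer plus 8 bytes (element
/// index 2 times the 4-byte element store size) and the returned index is 2.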
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
SelectionDAG &DAG, SDValue &Addr,
SDValue &Index, unsigned &Alignment) {
int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
if (TrueMaskElt < 0)
return false;
// Get the address of the one scalar element that is specified by the mask
// using the appropriate offset from the base pointer.
EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
Addr = MaskedOp->getBasePtr();
if (TrueMaskElt != 0) {
unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
}
Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
return true;
}
/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Load the one scalar element that is specified by the mask using the
// appropriate offset from the base pointer.
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDValue Load =
DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
Alignment, ML->getMemOperand()->getFlags());
// Insert the loaded element into the appropriate place in the vector.
SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
ML->getPassThru(), Load, VecIndex);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
// If we are loading the first and last elements of a vector, it is safe and
// always faster to load the whole vector. Replace the masked load with a
// vector load and select.
unsigned NumElts = VT.getVectorNumElements();
BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
if (LoadFirstElt && LoadLastElt) {
SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
ML->getMemOperand());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
ML->getPassThru());
return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
}
// Convert a masked load with a constant mask into a masked load and a select.
// This allows the select operation to use a faster kind of select instruction
// (for example, vblendvps -> vblendps).
// Don't try this if the pass-through operand is already undefined. That would
// cause an infinite loop because that's what we're about to create.
if (ML->getPassThru().isUndef())
return SDValue();
if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
return SDValue();
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
ML->getMask(), DAG.getUNDEF(VT),
ML->getMemoryVT(), ML->getMemOperand(),
ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
ML->getPassThru());
return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
}
if (Mld->getExtensionType() != ISD::EXTLOAD)
return SDValue();
// Resolve extending loads.
EVT VT = Mld->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
EVT LdVT = Mld->getMemoryVT();
SDLoc dl(Mld);
assert(LdVT != VT && "Cannot extend to the same type");
unsigned ToSz = VT.getScalarSizeInBits();
unsigned FromSz = LdVT.getScalarSizeInBits();
// From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for extending masked load");
unsigned SizeRatio = ToSz / FromSz;
assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
LdVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
// Convert PassThru value.
SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru());
if (!Mld->getPassThru().isUndef()) {
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru,
DAG.getUNDEF(WideVecVT), ShuffleVec);
}
// Prepare the new mask.
SDValue NewMask;
SDValue Mask = Mld->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
ShuffleVec[i] = NumElems * SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
ShuffleVec);
} else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
Ops[0] = Mask;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
Mld->getBasePtr(), NewMask, WidePassThru,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i * SizeRatio] = i;
// Can't shuffle using an illegal type.
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
DAG.getUNDEF(WideVecVT), ShuffleVec);
SlicedVec = DAG.getBitcast(VT, SlicedVec);
return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
SelectionDAG &DAG) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Extract the one scalar element that is actually being stored.
SDLoc DL(MS);
EVT VT = MS->getValue().getValueType();
EVT EltVT = VT.getVectorElementType();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
MS->getValue(), VecIndex);
// Store that element at the appropriate offset from the base pointer.
return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
Alignment, MS->getMemOperand()->getFlags());
}
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (Mst->isCompressingStore())
return SDValue();
EVT VT = Mst->getValue().getValueType();
EVT StVT = Mst->getMemoryVT();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Mst->isTruncatingStore()) {
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
return ScalarStore;
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
}
// TODO: AVX512 targets should also be able to simplify something like the
// pattern above, but that pattern will be different. It will either need to
// match setcc more generally or match PCMPGTM later (in tablegen?).
SDValue Value = Mst->getValue();
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
Mst->getBasePtr(), Mask,
Mst->getMemoryVT(), Mst->getMemOperand(), true);
}
return SDValue();
}
// Resolve truncating stores.
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
unsigned ToSz = StVT.getScalarSizeInBits();
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
// are designated for truncate store.
// In this case we don't need any further transformations.
if (TLI.isTruncStoreLegal(VT, StVT))
return SDValue();
// From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for truncating masked store");
// We are going to use the original vector elt for storing.
// Accumulated smaller vector elements must be a multiple of the store size.
assert (((NumElems * FromSz) % ToSz) == 0 &&
"Unexpected ratio for truncating masked store");
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
StVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
ShuffleVec);
SDValue NewMask;
SDValue Mask = Mst->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
ShuffleVec);
} else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
Ops[0] = Mask;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
Mst->getBasePtr(), NewMask, StVT,
Mst->getMemOperand(), false);
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Convert a store of vXi1 into a store of iX and a bitcast.
if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
VT.getVectorElementType() == MVT::i1) {
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
StoredVal = DAG.getBitcast(NewVT, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
// If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
// This will avoid a copy to k-register.
if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
StoredVal.getOperand(0).getValueType() == MVT::i8) {
return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
St->getBasePtr(), St->getPointerInfo(),
St->getAlignment(), St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / VT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
// Turn vXi1 stores of constants into a scalar store.
if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
// If it's a v64i1 store without 64-bit support, we need two stores.
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(0, 32));
Lo = combinevXi1ConstantToInteger(Lo, DAG);
SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
Alignment, St->getMemOperand()->getFlags());
SDValue Ch1 =
DAG.getStore(St->getChain(), dl, Hi, Ptr1,
St->getPointerInfo().getWithOffset(4),
MinAlign(Alignment, 4U),
St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
// If we are saving a concatenation of two XMM registers and 32-byte stores
// are slow, such as on Sandy Bridge, perform two 16-byte stores.
bool Fast;
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*St->getMemOperand(), &Fast) &&
!Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
return splitVectorStore(St, DAG);
}
// Split under-aligned vector non-temporal stores.
if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
// ZMM/YMM nt-stores - either it can be stored as a series of shorter
// vectors or the legalizer can scalarize it to use MOVNTI.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
return splitVectorStore(St, DAG);
}
// XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
// to use MOVNTI.
if (VT.is128BitVector() && Subtarget.hasSSE2()) {
MVT NTVT = Subtarget.hasSSE4A()
? MVT::v2f64
: (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
return scalarizeVectorStore(St, NTVT, DAG);
}
}
// Try to optimize v16i16->v16i8 truncating stores when BWI is not
// supported but AVX512F is, by extending to v16i32 and truncating.
if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
St->getValue().getOpcode() == ISD::TRUNCATE &&
St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
!DCI.isBeforeLegalizeOps()) {
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
MVT::v16i8, St->getMemOperand());
}
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
if (St->isTruncatingStore() && VT.isVector()) {
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
if (SDValue Val =
detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
TLI))
return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
DAG, dl, Subtarget, TLI))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
unsigned ToSz = StVT.getScalarSizeInBits();
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
// are designated for truncate store.
// In this case we don't need any further transformations.
if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
return SDValue();
// From, To sizes and ElemCount must be pow of two
if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
// We are going to use the original vector elt for storing.
// Accumulated smaller vector elements must be a multiple of the store size.
if (0 != (NumElems * FromSz) % ToSz) return SDValue();
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
StVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
if (!TLI.isTypeLegal(WideVecVT))
return SDValue();
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
ShuffleVec);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
// Find the largest store unit
MVT StoreType = MVT::i8;
for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
// On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
(64 <= NumElems * ToSz))
StoreType = MVT::f64;
// Bitcast the original vector into a vector of store-size units
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
SDValue Ptr = St->getBasePtr();
// Perform one or more big stores into memory.
for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
StoreType, ShuffWide,
DAG.getIntPtrConstant(i, dl));
SDValue Ch =
DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
St->getAlignment(), St->getMemOperand()->getFlags());
Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
Chains.push_back(Ch);
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
if (VT.getSizeInBits() != 64)
return SDValue();
const Function &F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
if (((VT.isVector() && !VT.isFloatingPoint()) ||
(VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
SmallVector<SDValue, 8> Ops;
if (!ISD::isNormalLoad(Ld))
return SDValue();
// If this is not the MMX case, i.e. we are just turning i64 load/store
// into f64 load/store, avoid the transformation if there are multiple
// uses of the loaded value.
if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
return SDValue();
SDLoc LdDL(Ld);
SDLoc StDL(N);
// If we are a 64-bit capable x86, lower to a single movq load/store pair.
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
if (Subtarget.is64Bit() || F64IsLegal) {
MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getMemOperand());
// Make sure new load is placed in same chain order.
DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
St->getMemOperand());
}
// Otherwise, lower to two pairs of 32-bit loads / stores.
SDValue LoAddr = Ld->getBasePtr();
SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
Ld->getPointerInfo().getWithOffset(4),
MinAlign(Ld->getAlignment(), 4),
Ld->getMemOperand()->getFlags());
// Make sure new loads are placed in same chain order.
DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
LoAddr = St->getBasePtr();
HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
SDValue LoSt =
DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
St->getAlignment(), St->getMemOperand()->getFlags());
SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
St->getPointerInfo().getWithOffset(4),
MinAlign(St->getAlignment(), 4),
St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
}
// This is similar to the above case, but here we handle a scalar 64-bit
// integer store that is extracted from a vector on a 32-bit target.
// If we have SSE2, then we can treat it like a floating-point double
// to get past legalization. The execution dependencies fixup pass will
// choose the optimal machine instruction for the store if this really is
// an integer or v2f32 rather than an f64.
if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue OldExtract = St->getOperand(1);
SDValue ExtOp0 = OldExtract.getOperand(0);
unsigned VecSize = ExtOp0.getValueSizeInBits();
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
return SDValue();
}
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
/// A = < float a0, float a1, float a2, float a3 >
/// and
/// B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
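/// On x86 such operations map onto instructions like HADDPS/HADDPD and, for
/// integer vectors, PHADDW/PHADDD (plus the corresponding horizontal-subtract
/// forms).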
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool IsCommutative) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
return false;
// Look for the following pattern:
// A = < float a0, float a1, float a2, float a3 >
// B = < float b0, float b1, float b2, float b3 >
// and
// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
unsigned NumElts = VT.getVectorNumElements();
// TODO - can we make a general helper method that does all of this for us?
auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
SmallVectorImpl<int> &ShuffleMask) {
if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
if (!Op.getOperand(0).isUndef())
N0 = Op.getOperand(0);
if (!Op.getOperand(1).isUndef())
N1 = Op.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
ShuffleMask.append(Mask.begin(), Mask.end());
return;
}
bool UseSubVector = false;
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getValueType().is256BitVector() &&
llvm::isNullConstant(Op.getOperand(1))) {
Op = Op.getOperand(0);
UseSubVector = true;
}
bool IsUnary;
SmallVector<SDValue, 2> SrcOps;
SmallVector<int, 16> SrcShuffleMask;
SDValue BC = peekThroughBitcasts(Op);
if (isTargetShuffle(BC.getOpcode()) &&
getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
SrcOps, SrcShuffleMask, IsUnary)) {
if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
SrcOps.size() <= 2) {
N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
}
if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
SrcOps.size() == 1) {
N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
ShuffleMask.append(Mask.begin(), Mask.end());
}
}
};
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
SDValue A, B;
SmallVector<int, 16> LMask;
GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
SmallVector<int, 16> RMask;
GetShuffle(RHS, C, D, RMask);
// At least one of the operands should be a vector shuffle.
unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
if (NumShuffles == 0)
return false;
if (LMask.empty()) {
A = LHS;
for (unsigned i = 0; i != NumElts; ++i)
LMask.push_back(i);
}
if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
RMask.push_back(i);
}
// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
std::swap(C, D);
ShuffleVectorSDNode::commuteMask(RMask);
}
// Check that the shuffles are both shuffling the same vectors.
if (!(A == C && B == D))
return false;
// LHS and RHS are now:
// LHS = shuffle A, B, LMask
// RHS = shuffle A, B, RMask
// Check that the masks correspond to performing a horizontal operation.
// AVX defines horizontal add/sub to operate independently on 128-bit lanes,
// so we just repeat the inner loop if this is a 256-bit op.
unsigned Num128BitChunks = VT.getSizeInBits() / 128;
unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
assert((NumEltsPer128BitChunk % 2 == 0) &&
"Vector type should have an even number of elements in each lane");
for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
// Ignore undefined components.
int LIdx = LMask[i + j], RIdx = RMask[i + j];
if (LIdx < 0 || RIdx < 0 ||
(!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
continue;
// The low half of the 128-bit result must choose from A.
// The high half of the 128-bit result must choose from B,
// unless B is undef. In that case, we are always choosing from A.
unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
// Check that successive elements are being operated on. If not, this is
// not a horizontal operation.
int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
if (!(LIdx == Index && RIdx == Index + 1) &&
!(IsCommutative && LIdx == Index + 1 && RIdx == Index))
return false;
}
}
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
return false;
LHS = DAG.getBitcast(VT, LHS);
RHS = DAG.getBitcast(VT, RHS);
return true;
}
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
bool IsFadd = N->getOpcode() == ISD::FADD;
auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
return SDValue();
}
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
/// anything that is guaranteed to be transformed by DAGCombiner.
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
unsigned SrcOpcode = Src.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
auto IsFreeTruncation = [VT](SDValue Op) {
unsigned TruncSizeInBits = VT.getScalarSizeInBits();
// See if this has been extended from a smaller/equal size to
// the truncation size, allowing a truncation to combine with the extend.
unsigned Opcode = Op.getOpcode();
if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
Opcode == ISD::ZERO_EXTEND) &&
Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
return true;
// See if this is a single-use constant which can be constant folded.
// NOTE: We don't peek through bitcasts here because there is currently
// no support for constant folding truncate+bitcast+vector_of_constants. So
// we'll just end up with a truncate on both operands which will
// get turned back into (truncate (binop)), causing an infinite loop.
return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
};
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
};
// Don't combine if the operation has other uses.
if (!Src.hasOneUse())
return SDValue();
// Only support vector truncation for now.
// TODO: i64 scalar math would benefit as well.
if (!VT.isVector())
return SDValue();
// In most cases it's only worth pre-truncating if we're only facing the cost
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
switch (SrcOpcode) {
case ISD::AND:
case ISD::XOR:
case ISD::OR: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 &&
TLI.isOperationLegal(SrcOpcode, VT) &&
!TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
case ISD::SUB: {
// TODO: ISD::SUB We are conservative and require both sides to be freely
// truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
return TruncateArithmetic(Op0, Op1);
break;
}
}
return SDValue();
}
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// MaskX = X & 0xffff (clear high bits to prevent saturation)
/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
EVT OutVT = N->getValueType(0);
APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
OutVT.getScalarSizeInBits());
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
}
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
EVT OutVT = N->getValueType(0);
In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
DAG.getValueType(OutVT));
return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
if (!OutVT.isVector())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = OutVT.getVectorNumElements();
// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
// SSE2, and we need to take care of it specially.
// AVX512 provides vpmovdb.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
// SSSE3's pshufb results in fewer instructions in the cases below.
if (Subtarget.hasSSSE3() && NumElems == 8 &&
((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
(InSVT == MVT::i32 && OutSVT == MVT::i16)))
return SDValue();
SDLoc DL(N);
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
if (InSVT == MVT::i32)
return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
return SDValue();
}
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
/// into X86ISD::PACKSS/PACKUS operations.
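/// For example (illustrative): a vXi32 comparison result consists entirely of
/// sign bits and can be packed down with PACKSS, while a vXi32 value known to
/// be zero-extended from i8 can be packed down with PACKUS.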
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Requires SSE2 but AVX512 has fast truncate.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
MVT VT = N->getValueType(0).getSimpleVT();
MVT SVT = VT.getScalarType();
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
// Check we have a truncation suited for PACKSS/PACKUS.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
return SDValue();
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Use PACKUS if the input has zero-bits that extend all the way to the
// packed/truncated value. e.g. masks, zext_in_reg, etc.
KnownBits Known = DAG.computeKnownBits(In);
unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
// Use PACKSS if the input has sign-bits that extend all the way to the
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
return SDValue();
}
// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
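// Illustrative example: for vXi16 operands A and B,
//   (trunc (srl (mul (sext A), (sext B)), 16)) becomes (mulhs A, B)
// and the zero-extended form becomes (mulhu A, B).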
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// First instruction should be a right shift of a multiply.
if (Src.getOpcode() != ISD::SRL ||
Src.getOperand(0).getOpcode() != ISD::MUL)
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Only handle vXi16 types that are at least 128-bits unless they will be
// widened.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
(!ExperimentalVectorWideningLegalization &&
VT.getVectorNumElements() < 8))
return SDValue();
// Input type should be vXi32.
EVT InVT = Src.getValueType();
if (InVT.getVectorElementType() != MVT::i32)
return SDValue();
// Need a shift by 16.
APInt ShiftAmt;
if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
ShiftAmt != 16)
return SDValue();
SDValue LHS = Src.getOperand(0).getOperand(0);
SDValue RHS = Src.getOperand(0).getOperand(1);
unsigned ExtOpc = LHS.getOpcode();
if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
RHS.getOpcode() != ExtOpc)
return SDValue();
// Peek through the extends.
LHS = LHS.getOperand(0);
RHS = RHS.getOperand(0);
// Ensure the input types match.
if (LHS.getValueType() != VT || RHS.getValueType() != VT)
return SDValue();
unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
// from one vector with signed bytes from another vector, adds together
// adjacent pairs of 16-bit products, and saturates the result before
// truncating to 16-bits.
//
// Which looks something like this:
// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector() || !Subtarget.hasSSSE3())
return SDValue();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
return SDValue();
SDValue SSatVal = detectSSatPattern(In, VT);
if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
return SDValue();
// Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
// of multiplies from even/odd elements.
SDValue N0 = SSatVal.getOperand(0);
SDValue N1 = SSatVal.getOperand(1);
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// TODO: Handle constant vectors and use knownbits/computenumsignbits?
// Canonicalize zero_extend to LHS.
if (N01.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N00, N01);
if (N11.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N10, N11);
// Ensure we have a zero_extend and a sign_extend.
if (N00.getOpcode() != ISD::ZERO_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::ZERO_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Ensure the extend is from vXi8.
if (N00.getValueType().getVectorElementType() != MVT::i8 ||
N01.getValueType().getVectorElementType() != MVT::i8 ||
N10.getValueType().getVectorElementType() != MVT::i8 ||
N11.getValueType().getVectorElementType() != MVT::i8)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// N00/N10 are zero extended. N01/N11 are sign extended.
// For each result element, the even element of one vector must be multiplied
// by the even element of the other vector, and the odd element of one vector
// must be multiplied by the odd element of the other vector. That is, for
// each element i the following computation must be performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
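// For example, result element 0 must come from A[0]*B[0] + A[1]*B[1] and
// result element 1 from A[2]*B[2] + A[3]*B[3].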
SDValue ZExtIn, SExtIn;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
}
// N0 indices must be the even element. N1 indices must be the next odd element.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// First time we find an input capture it.
if (!ZExtIn) {
ZExtIn = N00In;
SExtIn = N01In;
}
if (ZExtIn != N00In || SExtIn != N01In ||
ZExtIn != N10In || SExtIn != N11In)
return SDValue();
}
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
assert(InVT.getScalarType() == MVT::i8 &&
"Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
PMADDBuilder);
}
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
// Attempt to pre-truncate inputs to arithmetic ops instead.
if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
return V;
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
// Try to detect PMADD
if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
return PMAdd;
// Try to combine truncation with signed/unsigned saturation.
if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
return Val;
// Try to combine PMULHUW/PMULHW for vXi16.
if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
return V;
// The bitcast source is a direct mmx result.
// Detect a truncation to i32 of a bitcast from x86mmx.
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
SDValue BCSrc = Src.getOperand(0);
if (BCSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
// Try to truncate extended sign/zero bits with PACKSS/PACKUS.
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
return V;
return combineVectorTruncation(N, DAG, Subtarget);
}
/// Returns the negated value if the node \p N flips sign of FP value.
///
/// An FP-negation node may have different forms: FNEG(x), FXOR(x, 0x80000000)
/// or FSUB(0, x).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
/// This also recognizes splat of a negated value and returns the splat of that
/// value.
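/// For example, for a v4f32 value this recognizes
/// (v4f32 (bitcast (xor (bitcast x), <4 x 0x80000000>))) and returns x
/// (looking through any bitcasts around x).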
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
EVT VT = Op->getValueType(0);
// Make sure the element size doesn't change.
if (VT.getScalarSizeInBits() != ScalarSize)
return SDValue();
if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!SVOp->getOperand(1).isUndef())
return SDValue();
if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
SVOp->getMask());
return SDValue();
}
unsigned Opc = Op.getOpcode();
if (Opc == ISD::INSERT_VECTOR_ELT) {
// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
// -V, INDEX).
SDValue InsVector = Op.getOperand(0);
SDValue InsVal = Op.getOperand(1);
if (!InsVector.isUndef())
return SDValue();
if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
return SDValue();
}
if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
return SDValue();
SDValue Op1 = Op.getOperand(1);
SDValue Op0 = Op.getOperand(0);
// For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
// masks. For FSUB, we have to check if constant bits of Op0 are sign bit
// masks and hence we swap the operands.
if (Opc == ISD::FSUB)
std::swap(Op0, Op1);
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
// Extract constant bits and see if they are all sign bit masks. Ignore the
// undef elements.
if (getTargetConstantBitsFromNode(Op1, ScalarSize,
UndefElts, EltBits,
/* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false)) {
for (unsigned I = 0, E = EltBits.size(); I < E; I++)
if (!UndefElts[I] && !EltBits[I].isSignMask())
return SDValue();
return peekThroughBitcasts(Op0);
}
return SDValue();
}
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OrigVT = N->getValueType(0);
SDValue Arg = isFNEG(DAG, N);
if (!Arg)
return SDValue();
EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
// If we're negating a FMUL node on a target with FMA, then we can avoid the
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
return DAG.getBitcast(OrigVT, NewNode);
}
// If we're negating an FMA node, then we can adjust the
// instruction to include the extra negation.
unsigned NewOpcode = 0;
if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
switch (Arg.getOpcode()) {
case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
// We can't handle a scalar intrinsic node here because it would only
// invert one element and not the whole vector. But we could try to handle
// a negation of the lower element only.
}
}
if (NewOpcode)
return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
Arg.getNode()->ops()));
return SDValue();
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// If we have integer vector types available, use the integer opcodes.
if (!VT.isVector() || !Subtarget.hasSSE2())
return SDValue();
SDLoc dl(N);
unsigned IntBits = VT.getScalarSizeInBits();
MVT IntSVT = MVT::getIntegerVT(IntBits);
MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
unsigned IntOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected FP logic op");
case X86ISD::FOR: IntOpcode = ISD::OR; break;
case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
case X86ISD::FAND: IntOpcode = ISD::AND; break;
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
}
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
return DAG.getBitcast(VT, IntOp);
}
/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
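/// e.g. xor (setcc COND_E, EFLAGS), 1 --> setcc COND_NE, EFLAGS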
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() != ISD::XOR)
return SDValue();
SDValue LHS = N->getOperand(0);
auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
return SDValue();
X86::CondCode NewCC = X86::GetOppositeBranchCondition(
X86::CondCode(LHS->getConstantOperandVal(0)));
SDLoc DL(N);
return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// If this is SSE1 only convert to FXOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
N->getValueType(0) == MVT::v4i32) {
return DAG.getBitcast(
MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
}
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue SetCC = foldXor1SetCC(N, DAG))
return SetCC;
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
return combineFneg(N, DAG, Subtarget);
}
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
unsigned NumBits = VT.getSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// TODO - Constant Folding.
if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
// Reduce Cst1 to the bottom 16-bits.
// NOTE: SimplifyDemandedBits won't do this for constants.
const APInt &Val1 = Cst1->getAPIntValue();
APInt MaskedVal1 = Val1 & 0xFFFF;
if (MaskedVal1 != Val1)
return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
DAG.getConstant(MaskedVal1, SDLoc(N), VT));
}
// Only the bottom 16 bits of the control value are required.
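// e.g. a control value of 0x12345608 behaves the same as 0x5608 (start 8,
// length 0x56), so the upper bits can be cleared or simplified away.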
APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
}
static bool isNullFPScalarOrVectorConst(SDValue V) {
return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}
/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
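/// For example, a v4f32 build_vector of <0.0, undef, 0.0, 0.0> is replaced by
/// an all-zeros v4f32 constant.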
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!isNullFPScalarOrVectorConst(V))
return SDValue();
if (V.getValueType().isVector())
return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
return V;
}
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
// Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::f64 && Subtarget.hasSSE2()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
return SDValue();
auto isAllOnesConstantFP = [](SDValue V) {
if (V.getSimpleValueType().isVector())
return ISD::isBuildVectorAllOnes(V.getNode());
auto *C = dyn_cast<ConstantFPSDNode>(V);
return C && C->getConstantFPValue()->isAllOnesValue();
};
// fand (fxor X, -1), Y --> fandn X, Y
if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
// fand X, (fxor Y, -1) --> fandn Y, X
if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
return SDValue();
}
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FAND(0.0, x) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
return V;
// FAND(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FANDN(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// FANDN(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// F[X]OR(x, 0.0) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(1)))
return N->getOperand(0);
if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
// Only perform optimizations if UnsafeMath is used.
if (!DAG.getTarget().Options.UnsafeFPMath)
return SDValue();
// If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
// into FMINC and FMAXC, which are commutative operations.
unsigned NewOp = 0;
switch (N->getOpcode()) {
default: llvm_unreachable("unknown opcode");
case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
}
return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
}
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Subtarget.useSoftFloat())
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64) ||
(VT.isVector() && TLI.isTypeLegal(VT))))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
// If we don't have to respect NaN inputs, this is a direct translation to x86
// min/max instructions.
if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
// If one of the operands is known non-NaN use the native min/max instructions
// with the non-NaN input as second operand.
if (DAG.isKnownNeverNaN(Op1))
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
if (DAG.isKnownNeverNaN(Op0))
return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
// If we have to respect NaN inputs, this takes at least 3 instructions.
// Favor a library call when operating on a scalar and minimizing code size.
if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
// Op1
// Num NaN
// ----------------
// Num | Max | Op0 |
// Op0 ----------------
// NaN | Op1 | NaN |
// ----------------
//
// The SSE FP max/min instructions were not designed for this case, but rather
// to implement:
// Min = Op1 < Op0 ? Op1 : Op0
// Max = Op1 > Op0 ? Op1 : Op0
//
// So they always return Op0 if either input is a NaN. However, we can still
// use those instructions for fmaxnum by selecting away a NaN input.
// If either operand is NaN, the 2nd source operand (Op0) is passed through.
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the min/max. If both
// operands are NaN, the NaN value of Op1 is the result.
return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
// Convert a full vector load into vzload when not all bits are needed.
SDValue In = N->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
// Unless the load is volatile.
if (!LN->isVolatile()) {
SDLoc dl(N);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getIntegerVT(NumBits);
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
LN->getPointerInfo(),
LN->getAlignment(),
LN->getMemOperand()->getFlags());
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return SDValue(N, 0);
}
}
return SDValue();
}
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
SDValue In = N->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
// Unless the load is volatile.
if (!LN->isVolatile()) {
SDLoc dl(N);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getFloatingPointVT(NumBits);
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
LN->getPointerInfo(),
LN->getAlignment(),
LN->getMemOperand()->getFlags());
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return SDValue(N, 0);
}
}
return SDValue();
}
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// ANDNP(0, x) -> x
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return N->getOperand(1);
// ANDNP(x, 0) -> 0
if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Turn ANDNP back to AND if input is inverted.
if (SDValue Not = IsNOT(N->getOperand(0), DAG))
return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
N->getOperand(1));
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// BT ignores high bits in the bit index operand.
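// e.g. for a 32-bit index operand only the low 5 bits are demanded.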
unsigned BitWidth = N1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
return SDValue();
}
// Try to combine sext_in_reg of a cmov of constants by extending the constants.
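// e.g. (sext_inreg (i32 cmov 255, 1, cc, eflags), i8)
//        --> (i32 cmov -1, 1, cc, eflags)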
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
EVT DstVT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
return SDValue();
// Look through single use any_extends / truncs.
SDValue IntermediateBitwidthOp;
if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
N0.hasOneUse()) {
IntermediateBitwidthOp = N0;
N0 = N0.getOperand(0);
}
// See if we have a single use cmov.
if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
return SDValue();
SDValue CMovOp0 = N0.getOperand(0);
SDValue CMovOp1 = N0.getOperand(1);
// Make sure both operands are constants.
if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
!isa<ConstantSDNode>(CMovOp1.getNode()))
return SDValue();
SDLoc DL(N);
// If we looked through an any_extend/trunc above, apply the same op to the
// constants.
if (IntermediateBitwidthOp) {
unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
}
CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
EVT CMovVT = DstVT;
// We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
if (DstVT == MVT::i16) {
CMovVT = MVT::i32;
CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
}
SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
N0.getOperand(2), N0.getOperand(3));
if (CMovVT != DstVT)
CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
return CMov;
}
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
if (SDValue V = combineSextInRegCmov(N, DAG))
return V;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
SDLoc dl(N);
// SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
// since there is no sign-extended shift right operation on a vector with
// 64-bit elements.
// (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2: it may be replaced with an
// X86ISD::VSEXT node.
if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
}
return SDValue();
}
/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
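/// e.g. (i64 sext (add nsw (i32 x), 5)) --> (i64 add (i64 sext x), 5),
/// provided the result can feed another 'add' or 'shl' (LEA potential).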
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
Ext->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
// TODO: This should be valid for other integer types.
EVT VT = Ext->getValueType(0);
if (VT != MVT::i64)
return SDValue();
SDValue Add = Ext->getOperand(0);
if (Add.getOpcode() != ISD::ADD)
return SDValue();
bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
bool NSW = Add->getFlags().hasNoSignedWrap();
bool NUW = Add->getFlags().hasNoUnsignedWrap();
// We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
// into the 'zext'
if ((Sext && !NSW) || (!Sext && !NUW))
return SDValue();
// Having a constant operand to the 'add' ensures that we are not increasing
// the instruction count because the constant is extended for free below.
// A constant operand can also become the displacement field of an LEA.
auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
if (!AddOp1)
return SDValue();
// Don't make the 'add' bigger if there's no hope of combining it with some
// other 'add' or 'shl' instruction.
// TODO: It may be profitable to generate simpler LEA instructions in place
// of single 'add' instructions, but the cost model for selecting an LEA
// currently has a high threshold.
bool HasLEAPotential = false;
for (auto *User : Ext->uses()) {
if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
HasLEAPotential = true;
break;
}
}
if (!HasLEAPotential)
return SDValue();
// Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
SDValue AddOp0 = Add.getOperand(0);
SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
// The wider add is guaranteed to not wrap because both operands are
// sign-extended (for the nsw case) or zero-extended (for the nuw case).
SDNodeFlags Flags;
Flags.setNoSignedWrap(NSW);
Flags.setNoUnsignedWrap(NUW);
return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}
// If we see an {ANY,SIGN,ZERO}_EXTEND applied to a CMOV with constant
// operands, and the result of the CMOV is not used anywhere else, promote the
// CMOV itself instead of promoting its result. This could be beneficial,
// because:
// 1) X86TargetLowering::EmitLoweredSelect later can merge two (or more)
// pseudo-CMOVs only when they appear one after another, and getting rid of
// the result extension code after the CMOV helps with that.
// 2) Promotion of constant CMOV arguments is free, hence the
// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
// 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit CMOV is 3 bytes, so
// this promotion is also good in terms of code size.
// (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
// promotion).
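// e.g. (i32 zext (i16 cmov 7, 25, cc, eflags))
//        --> (i32 cmov 7, 25, cc, eflags)
// with the constants zero-extended up front and the original extend deleted.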
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
SDValue CMovN = Extend->getOperand(0);
if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
return SDValue();
EVT TargetVT = Extend->getValueType(0);
unsigned ExtendOpcode = Extend->getOpcode();
SDLoc DL(Extend);
EVT VT = CMovN.getValueType();
SDValue CMovOp0 = CMovN.getOperand(0);
SDValue CMovOp1 = CMovN.getOperand(1);
if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
!isa<ConstantSDNode>(CMovOp1.getNode()))
return SDValue();
// Only extend to i32 or i64.
if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
return SDValue();
// Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
// are free.
if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
return SDValue();
// If this is a zero extend to i64, we should only extend to i32 and use a free
// zero extend to finish.
EVT ExtendVT = TargetVT;
if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
ExtendVT = MVT::i32;
CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
CMovN.getOperand(2), CMovN.getOperand(3));
// Finish extending if needed.
if (ExtendVT != TargetVT)
Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
return Res;
}
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
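// e.g. (v8i16 zext (v8i1 bitcast (i8 x))) is rebuilt by broadcasting x to all
// lanes, AND'ing each lane with its corresponding bit mask, and comparing the
// result against that mask.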
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
Opcode != ISD::ANY_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InSVT = N0.getValueType().getScalarType();
unsigned EltSizeInBits = SVT.getSizeInBits();
// Input type must be extending a bool vector (bit-casted from a scalar
// integer) to legal integer types.
if (!VT.isVector())
return SDValue();
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
return SDValue();
if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
return SDValue();
SDValue N00 = N0.getOperand(0);
EVT SclVT = N0.getOperand(0).getValueType();
if (!SclVT.isScalarInteger())
return SDValue();
SDLoc DL(N);
SDValue Vec;
SmallVector<int, 32> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
// Broadcast the scalar integer to the vector elements.
if (NumElts > EltSizeInBits) {
// If the scalar integer is greater than the vector element size, then we
// must split it down into sub-sections for broadcasting. For example:
// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
unsigned Scale = NumElts / EltSizeInBits;
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
Vec = DAG.getBitcast(VT, Vec);
for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
} else {
// For smaller scalar integers, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to all
// elements.
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
ShuffleMask.append(NumElts, 0);
}
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
// Now, mask the relevant bit in each element.
SmallVector<SDValue, 32> Bits;
for (unsigned i = 0; i != NumElts; ++i) {
int BitIdx = (i % EltSizeInBits);
APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
Bits.push_back(DAG.getConstant(Bit, DL, SVT));
}
SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
// Compare against the bitmask and extend the result.
EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
// For SEXT, this is now done, otherwise shift the result down for
// zero-extension.
if (Opcode == ISD::SIGN_EXTEND)
return Vec;
return DAG.getNode(ISD::SRL, DL, VT, Vec,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating with
/// UNDEFs) the input into vectors of the same size as the target type, which
/// then extend the lowest elements.
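/// e.g. (v4i32 sext (v4i16 x)) becomes
/// (v4i32 sign_extend_vector_inreg (v8i16 concat (x, undef))).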
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (ExperimentalVectorWideningLegalization)
return SDValue();
unsigned Opcode = N->getOpcode();
// TODO - add ANY_EXTEND support.
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InVT = N0.getValueType();
EVT InSVT = InVT.getScalarType();
// FIXME: Generic DAGCombiner previously had a bug that would cause a
// sign_extend of setcc to sometimes return the original node and trick it
// into thinking CombineTo was used, which prevented the target combines from
// running.
// Bail out early here to avoid regressions like this:
// (v4i32 (sext (v4i1 (setcc (v4i16)))))
// Becomes
// (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef))))
// Type legalized to
// (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32)))))))
// Leading to a packssdw+pmovsxwd
// We could write a DAG combine to fix this, but really we shouldn't be
// creating sext_invec that's forcing v8i16 into the DAG.
if (N0.getOpcode() == ISD::SETCC)
return SDValue();
// Input type must be a vector and we must be extending legal integer types.
if (!VT.isVector() || VT.getVectorNumElements() < 2)
return SDValue();
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
// If the input/output types are both legal then we have at least AVX1 and
// we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
DAG.getTargetLoweringInfo().isTypeLegal(InVT))
return SDValue();
SDLoc DL(N);
auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
EVT SrcVT = N.getValueType();
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
Size / SrcVT.getScalarSizeInBits());
SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
DAG.getUNDEF(SrcVT));
Opnds[0] = N;
return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
};
// If the target size is less than 128 bits, widen the input (with UNDEFs) to
// a type whose extension is 128 bits wide, extend that, and then extract the
// original target vector.
if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
unsigned Scale = 128 / VT.getSizeInBits();
EVT ExVT =
EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
DAG.getIntPtrConstant(0, DL));
}
// If the target size is 128 bits (or 256 bits on an AVX target), then convert
// to ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
// Also use this if we don't have SSE41 to allow the legalizer to do its job.
if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
(VT.is256BitVector() && Subtarget.hasAVX()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, ExOp);
}
auto SplitAndExtendInReg = [&](unsigned SplitSize) {
unsigned NumVecs = VT.getSizeInBits() / SplitSize;
unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
DAG.getIntPtrConstant(Offset, DL));
SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
Opnds.push_back(SrcVec);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
};
// On pre-AVX targets, split into 128-bit nodes of
// ISD::*_EXTEND_VECTOR_INREG.
if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
return SplitAndExtendInReg(128);
// On pre-AVX512 targets, split into 256-bit nodes of
// ISD::*_EXTEND_VECTOR_INREG.
if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
return SplitAndExtendInReg(256);
return SDValue();
}
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
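// e.g. with AVX512, for an equality or signed predicate:
//   (v8i32 sext (v8i1 setcc (v8i32 a, v8i32 b, cc)))
//     --> (v8i32 setcc (v8i32 a, v8i32 b, cc))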
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc dl(N);
// Only do this combine with AVX512 for vector extends.
if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
return SDValue();
// Only combine legal element types.
EVT SVT = VT.getVectorElementType();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
return SDValue();
// We can only do this if the vector size is 256 bits or less.
unsigned Size = VT.getSizeInBits();
if (Size > 256)
return SDValue();
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
// those are the only integer compares we have.
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
if (ISD::isUnsignedIntSetCC(CC))
return SDValue();
// Only do this combine if the extension will be fully consumed by the setcc.
EVT N00VT = N0.getOperand(0).getValueType();
EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
if (Size != MatchingVecType.getSizeInBits())
return SDValue();
SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
if (N->getOpcode() == ISD::ZERO_EXTEND)
Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
return Res;
}
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = N0.getValueType();
SDLoc DL(N);
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
// Inverting and sign-extending a boolean is the same as zero-extending and
// subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
// efficiently lowered with an LEA or a DEC. This is the same as:
// select Bool, 0, -1.
// sext (xor Bool, -1) --> sub (zext Bool), 1
SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
}
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.isVector())
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
return SDValue();
}
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
if (NegMul) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMADD; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FNMADD: Opcode = ISD::FMA; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
}
}
if (NegAcc) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
case X86ISD::FMSUB: Opcode = ISD::FMA; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
}
}
return Opcode;
}
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
EVT ScalarVT = VT.getScalarType();
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
SDValue A = N->getOperand(0);
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
auto invertIfNegative = [&DAG](SDValue &V) {
if (SDValue NegVal = isFNEG(DAG, V.getNode())) {
V = DAG.getBitcast(V.getValueType(), NegVal);
return true;
}
// Look through extract_vector_elts. If it comes from an FNEG, create a
// new extract from the FNEG input.
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) {
NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
NegVal, V.getOperand(1));
return true;
}
}
return false;
};
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
bool NegA = invertIfNegative(A);
bool NegB = invertIfNegative(B);
bool NegC = invertIfNegative(C);
if (!NegA && !NegB && !NegC)
return SDValue();
unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode());
if (!NegVal)
return SDValue();
// FIXME: Should we bitcast instead?
if (NegVal.getValueType() != VT)
return SDValue();
unsigned NewOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
}
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
NegVal, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
NegVal);
}
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because
// ISD::SETCC is always legalized to i8.
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (N0.getOpcode() == ISD::AND &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
if (!isOneConstant(N0.getOperand(1)))
return SDValue();
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, dl, VT));
}
}
if (N0.getOpcode() == ISD::TRUNCATE &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, dl, VT));
}
}
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
if (DCI.isBeforeLegalizeOps())
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.isVector())
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
return R;
// TODO: Combine with any target/faux shuffle.
if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
unsigned NumSrcElts = N00.getValueType().getVectorNumElements();
unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
(N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128);
}
}
return SDValue();
}
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
// We're looking for an oversized integer equality comparison.
SDValue X = SetCC->getOperand(0);
SDValue Y = SetCC->getOperand(1);
EVT OpVT = X.getValueType();
unsigned OpSize = OpVT.getSizeInBits();
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();
// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
X.getOperand(0).getOpcode() == ISD::XOR &&
X.getOperand(1).getOpcode() == ISD::XOR;
if (isNullConstant(Y) && !IsOrXorXorCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
auto IsVectorBitCastCheap = [](SDValue X) {
X = peekThroughBitcasts(X);
return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
X.getOpcode() == ISD::LOAD;
};
if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
!IsOrXorXorCCZero)
return SDValue();
// TODO: Use PXOR + PTEST for SSE4.1 or later?
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && Subtarget.hasAVX2()) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
EVT VecVT = OpSize == 512 ? MVT::v16i32 :
OpSize == 256 ? MVT::v32i8 :
MVT::v16i8;
EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT;
SDValue Cmp;
if (IsOrXorXorCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
// Use 2 vector equality compares and 'and' the results before doing a
// MOVMSK.
SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
} else {
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
}
// For 512-bits we want to emit a setcc that will lower to kortest.
if (OpSize == 512)
return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp),
DAG.getConstant(0xFFFF, DL, MVT::i16), CC);
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
MVT::i32);
return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
}
return SDValue();
}
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
LHS.hasOneUse()) {
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
// x == 0-y --> x+y == 0
// x != 0-y --> x+y != 0
if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
RHS.hasOneUse()) {
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
return V;
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
// Put build_vectors on the right.
if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
}
bool IsSEXT0 =
(LHS.getOpcode() == ISD::SIGN_EXTEND) &&
(LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
if (IsSEXT0 && IsVZero1) {
assert(VT == LHS.getOperand(0).getValueType() &&
"Uexpected operand type");
if (CC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
if (CC == ISD::SETLE)
return DAG.getConstant(1, DL, VT);
if (CC == ISD::SETEQ || CC == ISD::SETGE)
return DAG.getNOT(DL, LHS.getOperand(0), VT);
assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
"Unexpected condition code!");
return LHS.getOperand(0);
}
}
// If we have AVX512 but not BWI and this is a vXi16/vXi8 setcc, just
// pre-promote its result type since vXi1 vectors don't get promoted
// during type legalization.
// NOTE: The element count check is to ignore operand types that need to
// go through type promotion to a 128-bit vector.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
(ExperimentalVectorWideningLegalization ||
VT.getVectorNumElements() > 4) &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
N->getOperand(2));
return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
}
// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
// to avoid scalarization via legalization because v4i32 is not a legal type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
LHS.getValueType() == MVT::v4f32)
return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
return SDValue();
}
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
unsigned NumBits = VT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// Perform constant folding.
if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
assert(VT == MVT::i32 && "Unexpected result type");
APInt Imm(32, 0);
for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
if (!Src.getOperand(Idx).isUndef() &&
Src.getConstantOperandAPInt(Idx).isNegative())
Imm.setBit(Idx);
}
return DAG.getConstant(Imm, SDLoc(N), VT);
}
// Look through int->fp bitcasts that don't change the element width.
unsigned EltWidth = SrcVT.getScalarSizeInBits();
if (Src.getOpcode() == ISD::BITCAST &&
Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
// Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
// with scalar comparisons.
if (SDValue NotSrc = IsNOT(Src, DAG)) {
SDLoc DL(N);
APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
NotSrc = DAG.getBitcast(SrcVT, NotSrc);
return DAG.getNode(ISD::XOR, DL, VT,
DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
DAG.getConstant(NotMask, DL, VT));
}
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
}
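// Canonicalize the index operand of gather/scatter nodes before op
// legalization (strip redundant extensions, force an i32/i64 index) and,
// without AVX512, demand only the sign bit of the mask.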
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
if (DCI.isBeforeLegalizeOps()) {
SDValue Index = N->getOperand(4);
// Remove any sign extends from 32 or smaller to larger than 32.
// Only do this before LegalizeOps in case we need the sign extend for
// legalization.
if (Index.getOpcode() == ISD::SIGN_EXTEND) {
if (Index.getScalarValueSizeInBits() > 32 &&
Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index.getOperand(0);
SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
if (Res == N) {
// The original sign extend has fewer users; add it back to the worklist
// in case it needs to be removed.
DCI.AddToWorklist(Index.getNode());
DCI.AddToWorklist(N);
}
return SDValue(Res, 0);
}
}
// Make sure the index is either i32 or i64
unsigned ScalarSize = Index.getScalarValueSizeInBits();
if (ScalarSize != 32 && ScalarSize != 64) {
MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index;
SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
if (Res == N)
DCI.AddToWorklist(N);
return SDValue(Res, 0);
}
// Try to remove zero extends from 32->64 if we know the sign bit of
// the input is zero.
if (Index.getOpcode() == ISD::ZERO_EXTEND &&
Index.getScalarValueSizeInBits() == 64 &&
Index.getOperand(0).getScalarValueSizeInBits() == 32) {
if (DAG.SignBitIsZero(Index.getOperand(0))) {
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Index.getOperand(0);
SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
if (Res == N) {
// The original zero extend has fewer users; add it back to the worklist
// in case it needs to be removed.
DCI.AddToWorklist(Index.getNode());
DCI.AddToWorklist(N);
}
return SDValue(Res, 0);
}
}
}
// Without AVX512 (i.e. AVX2 gathers), only the sign bit of each mask element
// is demanded.
if (!Subtarget.hasAVX512()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Mask = N->getOperand(2);
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
}
return SDValue();
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
// Try to simplify the EFLAGS and condition code operands.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
return getSETCC(CC, Flags, DL, DAG);
return SDValue();
}
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
// Try to simplify the EFLAGS and condition code operands.
// Make sure to not keep references to operands, as combineSetCCEFLAGS can
// RAUW them under us.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
N->getOperand(1), Cond, Flags);
}
return SDValue();
}
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away operation when it's from a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
N->getOperand(0)->getOperand(0), MaskConst);
SDValue Res = DAG.getBitcast(VT, NewAnd);
return Res;
}
return SDValue();
}
/// If we are converting a value to floating-point, try to replace scalar
/// truncate of an extracted vector element with a bitcast. This tries to keep
/// the sequence on XMM registers rather than moving between vector and GPRs.
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
// TODO: This is currently only used by combineSIntToFP, but it is generalized
// to allow being called by any similar cast opcode.
// TODO: Consider merging this into lowering: vectorizeExtractedCast().
SDValue Trunc = N->getOperand(0);
if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
return SDValue();
SDValue ExtElt = Trunc.getOperand(0);
if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isNullConstant(ExtElt.getOperand(1)))
return SDValue();
EVT TruncVT = Trunc.getValueType();
EVT SrcVT = ExtElt.getValueType();
unsigned DestWidth = TruncVT.getSizeInBits();
unsigned SrcWidth = SrcVT.getSizeInBits();
if (SrcWidth % DestWidth != 0)
return SDValue();
// inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
unsigned VecWidth = SrcVecVT.getSizeInBits();
unsigned NumElts = VecWidth / DestWidth;
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
SDLoc DL(N);
SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
BitcastVec, ExtElt.getOperand(1));
return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
}
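// Combine ISD::UINT_TO_FP: widen small vector integer sources to vXi32 and
// use SINT_TO_FP, or use SINT_TO_FP directly when the sign bit is known zero.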
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
// UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
if (DAG.SignBitIsZero(Op0))
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
return SDValue();
}
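// Combine ISD::SINT_TO_FP: fold conversions of constant-masked vector
// compares, move the integer source toward i32 where possible, and build an
// x87 FILD for i64 loads on 32-bit targets.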
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Without AVX512DQ we only support i64 to float scalar conversion. For both
// vectors and scalars, see if we know that the upper bits are all the sign
// bit, in which case we can truncate the input to i32 and convert from that.
if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
unsigned BitWidth = InVT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
if (NumSignBits >= (BitWidth - 31)) {
EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
if (InVT.isVector())
TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
InVT.getVectorNumElements());
SDLoc dl(N);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
}
}
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
EVT LdVT = Ld->getValueType(0);
// This transformation is not supported if the result type is f16 or f128.
if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
// If we have AVX512DQ we can use packed conversion instructions unless
// the VT is f80.
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
if (!Ld->isVolatile() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
}
}
if (SDValue V = combineToFPTruncExtElt(N, DAG))
return V;
return SDValue();
}
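// Return true if any user of this EFLAGS value uses a condition code that
// depends on the carry or overflow flag (or is a user we don't understand).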
static bool needCarryOrOverflowFlag(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
X86::CondCode CC;
switch (User->getOpcode()) {
default:
// Be conservative.
return true;
case X86ISD::SETCC:
case X86ISD::SETCC_CARRY:
CC = (X86::CondCode)User->getConstantOperandVal(0);
break;
case X86ISD::BRCOND:
CC = (X86::CondCode)User->getConstantOperandVal(2);
break;
case X86ISD::CMOV:
CC = (X86::CondCode)User->getConstantOperandVal(2);
break;
}
switch (CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
case X86::COND_O: case X86::COND_NO:
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
return true;
}
}
return false;
}
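// Return true if every user of this EFLAGS value only tests for equality
// (COND_E/COND_NE), i.e. only the zero flag matters.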
static bool onlyZeroFlagUsed(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
unsigned CCOpNo;
switch (User->getOpcode()) {
default:
// Be conservative.
return false;
case X86ISD::SETCC: CCOpNo = 0; break;
case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
case X86ISD::BRCOND: CCOpNo = 2; break;
case X86ISD::CMOV: CCOpNo = 2; break;
}
X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
if (CC != X86::COND_E && CC != X86::COND_NE)
return false;
}
return true;
}
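// Optimize EFLAGS = X86ISD::CMP X, 0 (test patterns): turn constant logical
// shifts into ANDs when only the zero flag is used, and narrow compares of
// truncated binops so their flags can be used directly.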
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
// Only handle test patterns.
if (!isNullConstant(N->getOperand(1)))
return SDValue();
// If we have a CMP of a truncated binop, see if we can make a smaller binop
// and use its flags directly.
// TODO: Maybe we should try promoting compares that only use the zero flag
// first if we can prove the upper bits with computeKnownBits?
SDLoc dl(N);
SDValue Op = N->getOperand(0);
EVT VT = Op.getValueType();
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
onlyZeroFlagUsed(SDValue(N, 0))) {
unsigned BitWidth = VT.getSizeInBits();
const APInt &ShAmt = Op.getConstantOperandAPInt(1);
if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
APInt Mask = Op.getOpcode() == ISD::SRL
? APInt::getHighBitsSet(BitWidth, MaskBits)
: APInt::getLowBitsSet(BitWidth, MaskBits);
if (Mask.isSignedIntN(32)) {
Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
DAG.getConstant(Mask, dl, VT));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, VT));
}
}
}
// Look for a truncate with a single use.
if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
return SDValue();
Op = Op.getOperand(0);
// Arithmetic op can only have one use.
if (!Op.hasOneUse())
return SDValue();
unsigned NewOpc;
switch (Op.getOpcode()) {
default: return SDValue();
case ISD::AND:
// Skip AND with a constant. We have special handling for AND with an
// immediate during isel to generate TEST instructions.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
NewOpc = X86ISD::AND;
break;
case ISD::OR: NewOpc = X86ISD::OR; break;
case ISD::XOR: NewOpc = X86ISD::XOR; break;
case ISD::ADD:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
return SDValue();
NewOpc = X86ISD::ADD;
break;
case ISD::SUB:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
return SDValue();
NewOpc = X86ISD::SUB;
break;
}
// We found an op we can narrow. Truncate its inputs.
SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
// Use an X86-specific opcode to avoid DAG combine messing with it.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
// For AND, keep a CMP so that we can match the test pattern.
if (NewOpc == X86ISD::AND)
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, VT));
// Return the flags.
return Op.getValue(1);
}
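// Optimize RES, EFLAGS = X86ISD::ADD/SUB LHS, RHS: drop back to a generic
// ADD/SUB when the flag result is unused, and fold matching generic nodes to
// reuse this flag-producing node.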
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
"Expected X86ISD::ADD or X86ISD::SUB");
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
MVT VT = LHS.getSimpleValueType();
unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
// If we don't use the flag result, simplify back to a generic ADD/SUB.
if (!N->hasAnyUseOfValue(1)) {
SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
}
// Fold any similar generic ADD/SUB opcodes to reuse this node.
auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
SDValue Ops[] = {N0, N1};
SDVTList VTs = DAG.getVTList(N->getValueType(0));
if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
SDValue Op(N, 0);
if (Negate)
Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
DCI.CombineTo(GenericAddSub, Op);
}
};
MatchGeneric(LHS, RHS, false);
MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
return SDValue();
}
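// Optimize RES, EFLAGS = X86ISD::SBB LHS, RHS, EFLAGS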
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
N->getOperand(0), N->getOperand(1),
Flags);
}
// Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
// iff the flag result is dead.
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
!N->hasAnyUseOfValue(1))
return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
Op0.getOperand(1), N->getOperand(2));
return SDValue();
}
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
if (X86::isZeroNode(N->getOperand(0)) &&
X86::isZeroNode(N->getOperand(1)) &&
// We don't have a good way to replace an EFLAGS use, so only do this when
// the EFLAGS result is dead right now.
SDValue(N, 1).use_empty()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL,
MVT::i8),
N->getOperand(2)),
DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
}
if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
N->getOperand(0), N->getOperand(1),
Flags);
}
return SDValue();
}
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
bool IsSub = N->getOpcode() == ISD::SUB;
SDValue X = N->getOperand(0);
SDValue Y = N->getOperand(1);
// If this is an add, canonicalize a zext operand to the RHS.
// TODO: Incomplete? What if both sides are zexts?
if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
Y.getOpcode() != ISD::ZERO_EXTEND)
std::swap(X, Y);
// Look through a one-use zext.
bool PeekedThroughZext = false;
if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
Y = Y.getOperand(0);
PeekedThroughZext = true;
}
// If this is an add, canonicalize a setcc operand to the RHS.
// TODO: Incomplete? What if both sides are setcc?
// TODO: Should we allow peeking through a zext of the other operand?
if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
Y.getOpcode() != X86ISD::SETCC)
std::swap(X, Y);
if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
auto *ConstantX = dyn_cast<ConstantSDNode>(X);
if (ConstantX) {
if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
// This is a complicated way to get -1 or 0 from the carry flag:
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
Y.getOperand(1));
}
if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
SDValue EFLAGS = Y->getOperand(1);
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
// Swap the operands of a SUB, and we have the same pattern as above.
// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
SDValue NewSub = DAG.getNode(
X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
NewEFLAGS);
}
}
}
if (CC == X86::COND_B) {
// X + SETB Z --> adc X, 0
// X - SETB Z --> sbb X, 0
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), Y.getOperand(1));
}
if (CC == X86::COND_A) {
SDValue EFLAGS = Y->getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), NewEFLAGS);
}
}
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
SDValue Z = Cmp.getOperand(0);
EVT ZVT = Z.getValueType();
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
if (ConstantX) {
// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
// fake operands:
// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
SDValue Zero = DAG.getConstant(0, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
}
// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
// with fake operands:
// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
}
}
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
DAG.getConstant(-1ULL, DL, VT), Cmp1);
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
DAG.getConstant(0, DL, VT), Cmp1);
}
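// Try to turn a vector reduction add of a sign-extended i16 multiply into a
// VPMADDWD (pmaddwd) node.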
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// Do not use PMADD for vectors narrower than 8 elements; anything wider than
// the supported register size is split later by SplitOpsAndApply.
if (!VT.isVector() || VT.getVectorNumElements() < 8)
return SDValue();
if (Op0.getOpcode() != ISD::MUL)
std::swap(Op0, Op1);
if (Op0.getOpcode() != ISD::MUL)
return SDValue();
ShrinkMode Mode;
if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16)
return SDValue();
SDLoc DL(N);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements());
EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
VT.getVectorNumElements() / 2);
// Madd vector size is half of the original vector size
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
auto BuildPMADDWD = [&](SDValue Mul) {
// Shrink the operands of mul.
SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0));
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1));
SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
PMADDWDBuilder);
// Fill the rest of the output with 0
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd,
DAG.getConstant(0, DL, MAddVT));
};
Op0 = BuildPMADDWD(Op0);
// It's possible that Op1 is also a mul we can reduce.
if (Op1.getOpcode() == ISD::MUL &&
canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) {
Op1 = BuildPMADDWD(Op1);
}
return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
}
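// Try to turn a vector reduction add of abs(zext(a) - zext(b)) into a PSADBW
// (sum of absolute differences) node.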
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// TODO: There's nothing special about i32, any integer type above i16 should
// work just as well.
if (!VT.isVector() || !VT.isSimple() ||
!(VT.getVectorElementType() == MVT::i32))
return SDValue();
unsigned RegSize = 128;
if (Subtarget.useBWIRegs())
RegSize = 512;
else if (Subtarget.hasAVX())
RegSize = 256;
// We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (VT.getSizeInBits() / 4 > RegSize)
return SDValue();
// We know N is a reduction add, which means one of its operands is a phi.
// To match SAD, we need the other operand to be an ABS.
if (Op0.getOpcode() != ISD::ABS)
std::swap(Op0, Op1);
if (Op0.getOpcode() != ISD::ABS)
return SDValue();
auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
// SAD pattern detected. Now build a SAD instruction and an addition for
// reduction. Note that the number of elements in the result of SAD is
// fewer than the number of elements in its input. Therefore, we can only
// update part of the elements in the reduction vector.
SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
// The output of PSADBW is a vector of i64.
// We need to turn the vector of i64 into a vector of i32.
// If the reduction vector is at least as wide as the psadbw result, just
// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
// anyway.
MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
if (VT.getSizeInBits() >= ResVT.getSizeInBits())
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
else
Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
// Fill the upper elements with zero to match the add width.
SDValue Zero = DAG.getConstant(0, DL, VT);
Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
DAG.getIntPtrConstant(0, DL));
}
return Sad;
};
// Check whether we have an abs-diff pattern feeding into the select.
SDValue SadOp0, SadOp1;
if (!detectZextAbsDiff(Op0, SadOp0, SadOp1))
return SDValue();
Op0 = BuildPSADBW(SadOp0, SadOp1);
// It's possible we have a sad on the other side too.
if (Op1.getOpcode() == ISD::ABS &&
detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
Op1 = BuildPSADBW(SadOp0, SadOp1);
}
return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
}
/// Convert vector increment or decrement to sub/add with an all-ones constant:
/// add X, <1, 1...> --> sub X, <-1, -1...>
/// sub X, <1, 1...> --> add X, <-1, -1...>
/// The all-ones vector constant can be materialized using a pcmpeq instruction
/// that is commonly recognized as an idiom (has no register dependency), so
/// that's better/smaller than loading a splat 1 constant.
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Unexpected opcode for increment/decrement transform");
// Pseudo-legality check: getOnesVector() expects one of these types, so bail
// out and wait for legalization if we have an unsupported vector length.
EVT VT = N->getValueType(0);
if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
APInt SplatVal;
if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue())
return SDValue();
SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
// Example of pattern we try to detect:
// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
//(add (build_vector (extract_elt t, 0),
// (extract_elt t, 2),
// (extract_elt t, 4),
// (extract_elt t, 6)),
// (build_vector (extract_elt t, 1),
// (extract_elt t, 3),
// (extract_elt t, 5),
// (extract_elt t, 7)))
if (!Subtarget.hasSSE2())
return SDValue();
if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
Op1.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
// Check if one of Op0,Op1 is of the form:
// (build_vector (extract_elt Mul, 0),
// (extract_elt Mul, 2),
// (extract_elt Mul, 4),
// ...
// the other is of the form:
// (build_vector (extract_elt Mul, 1),
// (extract_elt Mul, 3),
// (extract_elt Mul, 5),
// ...
// and identify Mul.
SDValue Mul;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
// TODO: Be more tolerant to undefs.
if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
if (!Const0L || !Const1L || !Const0H || !Const1H)
return SDValue();
unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
// Commutativity of mul allows factors of a product to reorder.
if (Idx0L > Idx1L)
std::swap(Idx0L, Idx1L);
if (Idx0H > Idx1H)
std::swap(Idx0H, Idx1H);
// Commutativity of add allows pairs of factors to reorder.
if (Idx0L > Idx0H) {
std::swap(Idx0L, Idx0H);
std::swap(Idx1L, Idx1H);
}
if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
Idx1H != 2 * i + 3)
return SDValue();
if (!Mul) {
// First time an extract_elt's source vector is visited. Must be a MUL
// with 2X the number of vector elements of the BUILD_VECTOR.
// Both extracts must be from the same MUL.
Mul = Op0L->getOperand(0);
if (Mul->getOpcode() != ISD::MUL ||
Mul.getValueType().getVectorNumElements() != 2 * e)
return SDValue();
}
// Check that the extract is from the same MUL previously seen.
if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
return SDValue();
}
// Check if the Mul source can be safely shrunk.
ShrinkMode Mode;
if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
return SDValue();
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
assert(InVT.getScalarType() == MVT::i32 &&
"Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements());
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT,
{ Mul.getOperand(0), Mul.getOperand(1) },
PMADDBuilder);
}
// Attempt to turn this pattern into PMADDWD.
// (add (mul (sext (build_vector)), (sext (build_vector))),
//      (mul (sext (build_vector)), (sext (build_vector))))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// All inputs need to be sign extends.
// TODO: Support ZERO_EXTEND from known positive?
if (N00.getOpcode() != ISD::SIGN_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::SIGN_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Must be extending from vXi16.
EVT InVT = N00.getValueType();
if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
N10.getValueType() != InVT || N11.getValueType() != InVT)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// For each element, we need the odd element from one vector multiplied by
// the odd element of the other vector, and the even element from the first
// vector multiplied by the even element from the other vector. So for each
// element i, we need to make sure this operation is being performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
SDValue In0, In1;
for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
}
// N0 indices must be the even element. N1 indices must be the next odd element.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// The first time we find an input, capture it.
if (!In0) {
In0 = N00In;
In1 = N01In;
}
// Mul is commutative so the input vectors can be in any order.
// Canonicalize to make the compares easier.
if (In0 != N00In)
std::swap(N00In, N01In);
if (In0 != N10In)
std::swap(N10In, N11In);
if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
return SDValue();
}
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT OpVT = Ops[0].getValueType();
assert(OpVT.getScalarType() == MVT::i16 &&
"Unexpected scalar element type");
assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
OpVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
PMADDBuilder);
}
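// Combine ISD::ADD: vector reduction SAD/MAdd patterns, PMADDWD matching,
// horizontal adds, increment-by-splat-of-1, and ADC/SBB formation.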
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
if (Flags.hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
// Try to synthesize horizontal adds from adds of shuffles.
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
HADDBuilder);
}
if (SDValue V = combineIncDecVector(N, DAG))
return V;
return combineAddOrSubToADCOrSBB(N, DAG);
}
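// Try to turn umax(a,b) - b or a - umin(a,b) into an unsigned saturating
// subtraction (USUBSAT / PSUBUS), truncating wide element types first where
// the known zero bits allow it.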
static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// PSUBUS is supported, starting from SSE2, but truncation for v8i32
// is only worth it with SSSE3 (PSHUFB).
if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
!(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
!(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
!(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
VT == MVT::v16i32 || VT == MVT::v8i64)))
return SDValue();
SDValue SubusLHS, SubusRHS;
// Try to find umax(a,b) - b or a - umin(a,b) patterns;
// they may be converted to subus(a,b).
// TODO: Need to add IR canonicalization for this code.
if (Op0.getOpcode() == ISD::UMAX) {
SubusRHS = Op1;
SDValue MaxLHS = Op0.getOperand(0);
SDValue MaxRHS = Op0.getOperand(1);
if (MaxLHS == Op1)
SubusLHS = MaxRHS;
else if (MaxRHS == Op1)
SubusLHS = MaxLHS;
else
return SDValue();
} else if (Op1.getOpcode() == ISD::UMIN) {
SubusLHS = Op0;
SDValue MinLHS = Op1.getOperand(0);
SDValue MinRHS = Op1.getOperand(1);
if (MinLHS == Op0)
SubusRHS = MinRHS;
else if (MinRHS == Op0)
SubusRHS = MinLHS;
else
return SDValue();
} else
return SDValue();
auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
};
// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ SubusLHS, SubusRHS }, USUBSATBuilder);
// The special preprocessing can only be applied if the value was zero
// extended from 16 bits, so we require at least 16 leading zero bits for
// 32-bit values, or at least 48 leading zero bits for 64-bit values.
KnownBits Known = DAG.computeKnownBits(SubusLHS);
unsigned NumZeros = Known.countMinLeadingZeros();
if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
return SDValue();
EVT ExtType = SubusLHS.getValueType();
EVT ShrinkedType;
if (VT == MVT::v8i32 || VT == MVT::v8i64)
ShrinkedType = MVT::v8i16;
else
ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
// SubusLHS is zero extended, so clamp SubusRHS to the narrower type's range
// before truncating: SubusRHS = umin(0xFFF..., SubusRHS).
SDValue SaturationConst =
DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
ShrinkedType.getScalarSizeInBits()),
SDLoc(SubusLHS), ExtType);
SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
SaturationConst);
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
SDValue Psubus =
SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
{ NewSubusLHS, NewSubusRHS }, USUBSATBuilder);
// Zero extend the result back to the original type; it may be used elsewhere
// at full width. If not, the zext and the following trunc will be combined
// away.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
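// Combine ISD::SUB: push negation of a constant LHS into a preceding XOR,
// form horizontal subs, decrement-by-splat-of-1, USUBSAT, and ADC/SBB.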
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// X86 can't encode an immediate LHS of a sub. See if we can push the
// negation into a preceding instruction.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
// If the RHS of the sub is a XOR with one use and a constant, invert the
// immediate. Then add one to the LHS of the sub so we can turn
// X-Y -> X+~Y+1, saving one register.
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
const APInt &XorC = Op1.getConstantOperandAPInt(1);
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
DAG.getConstant(~XorC, SDLoc(Op1), VT));
return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
}
}
// Try to synthesize horizontal subs from subs of shuffles.
EVT VT = N->getValueType(0);
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
HSUBBuilder);
}
if (SDValue V = combineIncDecVector(N, DAG))
return V;
// Try to create PSUBUS if SUB's argument is max/min
if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
return V;
return combineAddOrSubToADCOrSBB(N, DAG);
}
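// Fold X86ISD::PCMPEQ/PCMPGT with identical operands to all-ones/zero.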
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
return DAG.getConstant(-1, DL, VT);
if (N->getOpcode() == X86ISD::PCMPGT)
return DAG.getConstant(0, DL, VT);
}
return SDValue();
}
/// Helper that combines an array of subvector ops as if they were the operands
/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
if (llvm::all_of(Ops, [](SDValue Op) {
return ISD::isBuildVectorAllZeros(Op.getNode());
}))
return getZeroVector(VT, Subtarget, DAG, DL);
SDValue Op0 = Ops[0];
// Fold subvector loads into one.
// If needed, look through bitcasts to get to the load.
if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
bool Fast;
const X86TargetLowering *TLI = Subtarget.getTargetLowering();
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*FirstLd->getMemOperand(), &Fast) &&
Fast) {
if (SDValue Ld =
EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
return Ld;
}
}
// Repeated subvectors.
if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
// If this broadcast/subv_broadcast is inserted into both halves, use a
// larger broadcast/subv_broadcast.
if (Op0.getOpcode() == X86ISD::VBROADCAST ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
Op0.getOperand(0),
DAG.getIntPtrConstant(0, DL)));
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
(VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
}
bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
// but it currently struggles with different vector widths.
if (llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOpcode() == Op0.getOpcode();
})) {
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFD:
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
Op0.getOperand(1));
}
LLVM_FALLTHROUGH;
case X86ISD::VPERMILPI:
// TODO - add support for vXf64/vXi64 shuffles.
if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
Op0.getOperand(1));
return DAG.getBitcast(VT, Res);
}
break;
case X86ISD::PACKUS:
if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
RHS.push_back(Ops[i].getOperand(1));
}
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumOps * SrcVT.getVectorNumElements());
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
}
break;
}
}
// If we're inserting all zeros into the upper half, change this to
// an insert into an all zeros vector. We will match this to a move
// with implicit upper bit zeroing during isel.
if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode()))
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), Ops[0],
DAG.getIntPtrConstant(0, DL));
return SDValue();
}
static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Don't do anything for i1 vectors.
if (VT.getVectorElementType() == MVT::i1)
return SDValue();
if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
DCI, Subtarget))
return R;
}
return SDValue();
}
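// Combine ISD::INSERT_SUBVECTOR: fold inserts of undef/zero subvectors, turn
// insert-of-extract into a shuffle, recognize concat_vectors patterns, and
// widen broadcasts inserted into undef vectors.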
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
MVT OpVT = N->getSimpleValueType(0);
bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
SDLoc dl(N);
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
uint64_t IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
if (Vec.isUndef() && SubVec.isUndef())
return DAG.getUNDEF(OpVT);
// Inserting undefs/zeros into zeros/undefs is a zero vector.
if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
(SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
return getZeroVector(OpVT, Subtarget, DAG, dl);
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVec.getOperand(1),
DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
}
// If we're inserting into a zero vector, our input was extracted from an
// insert into a zero vector of the same type, and the extraction covers at
// least as much as the original insertion, just insert the original
// subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
SubVec.getConstantOperandAPInt(1) == 0 &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
if (Ins.getConstantOperandAPInt(2) == 0 &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
}
}
// Stop here if this is an i1 vector.
if (IsI1Vector)
return SDValue();
// If this is an insert of an extract, combine to a shuffle. Don't do this
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
(IdxVal != 0 || !Vec.isUndef())) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
int SubVecNumElts = SubVecVT.getVectorNumElements();
SmallVector<int, 64> Mask(VecNumElts);
// First create an identity shuffle mask.
for (int i = 0; i != VecNumElts; ++i)
Mask[i] = i;
// Now insert the extracted portion.
for (int i = 0; i != SubVecNumElts; ++i)
Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
}
}
// Match concat_vector style patterns.
SmallVector<SDValue, 2> SubVectorOps;
if (collectConcatOps(N, SubVectorOps))
if (SDValue Fold =
combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
return Fold;
// If we are inserting into both halves of the vector, the starting vector
// should be undef. If it isn't, make it so. Only do this if the early insert
// has no other uses.
// TODO: Should this be a generic DAG combine?
// TODO: Why doesn't SimplifyDemandedVectorElts catch this?
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
Vec.getOperand(1).getValueSizeInBits() == SubVecVT.getSizeInBits() &&
Vec.hasOneUse()) {
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
Vec.getOperand(1), Vec.getOperand(2));
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
N->getOperand(2));
}
// If this is a broadcast insert into an upper undef, use a larger broadcast.
if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
return SDValue();
}
/// If we are extracting a subvector of a vector select and the select condition
/// is composed of concatenated vectors, try to narrow the select width. This
/// is a common pattern for AVX1 integer code because 256-bit selects may be
/// legal, but there is almost no integer math/logic available for 256-bit.
/// This function should only be called with legal types (otherwise, the calls
/// to get simple value types will assert).
static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
SmallVector<SDValue, 4> CatOps;
if (Sel.getOpcode() != ISD::VSELECT ||
!collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
return SDValue();
// Note: We assume simple value types because this should only be called with
// legal operations/types.
// TODO: This can be extended to handle extraction to 256-bits.
MVT VT = Ext->getSimpleValueType(0);
if (!VT.is128BitVector())
return SDValue();
MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
return SDValue();
MVT WideVT = Ext->getOperand(0).getSimpleValueType();
MVT SelVT = Sel.getSimpleValueType();
assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
"Unexpected vector type with legal operations");
unsigned SelElts = SelVT.getVectorNumElements();
unsigned CastedElts = WideVT.getVectorNumElements();
unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
if (SelElts % CastedElts == 0) {
// The select has the same or more (narrower) elements than the extract
// operand. The extraction index gets scaled by that factor.
ExtIdx *= (SelElts / CastedElts);
} else if (CastedElts % SelElts == 0) {
// The select has fewer (wider) elements than the extract operand. Make sure
// that the extraction index can be divided evenly.
unsigned IndexDivisor = CastedElts / SelElts;
if (ExtIdx % IndexDivisor != 0)
return SDValue();
ExtIdx /= IndexDivisor;
} else {
llvm_unreachable("Element count of simple vector types are not divisible?");
}
unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
unsigned NarrowElts = SelElts / NarrowingFactor;
MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
SDLoc DL(Ext);
SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
return DAG.getBitcast(VT, NarrowSel);
}
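// Combine ISD::EXTRACT_SUBVECTOR: split AVX1 256-bit and+not patterns, narrow
// extracted vector selects, fold extracts of constants and build_vectors, and
// shrink extracts of broadcasts, conversions, and extensions.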
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// For AVX1 only, if we are extracting from a 256-bit and+not (which will
// eventually get combined/lowered into ANDNP) with a concatenated operand,
// split the 'and' into 128-bit ops to avoid the concatenate and extract.
// We let generic combining take over from there to simplify the
// insert/extract and 'not'.
// This pattern emerges during AVX1 legalization. We handle it before lowering
// to avoid complications like splitting constant vector loads.
// Capture the original wide type in the likely case that we need to bitcast
// back to this type.
if (!N->getValueType(0).isSimple())
return SDValue();
MVT VT = N->getSimpleValueType(0);
EVT WideVecVT = N->getOperand(0).getValueType();
SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
TLI.isTypeLegal(WideVecVT) &&
WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
auto isConcatenatedNot = [] (SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
return false;
SDValue NotOp = V->getOperand(0);
return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
};
if (isConcatenatedNot(WideVec.getOperand(0)) ||
isConcatenatedNot(WideVec.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
SDValue Concat = split256IntArith(WideVec, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
}
}
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
SDValue InVec = N->getOperand(0);
unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
if (VT.getScalarType() == MVT::i1)
return DAG.getConstant(1, SDLoc(N), VT);
return getOnesVector(VT, DAG, SDLoc(N));
}
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
// Try to move vector bitcast after extract_subv by scaling extraction index:
// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
// TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
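// Example (illustrative): extract_subv v4i32 (bitcast v4i64 X to v8i32), 4
//   --> bitcast v4i32 (extract_subv v2i64 X, 2)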
if (InVec.getOpcode() == ISD::BITCAST &&
InVec.getOperand(0).getValueType().isVector()) {
SDValue SrcOp = InVec.getOperand(0);
EVT SrcVT = SrcOp.getValueType();
unsigned SrcNumElts = SrcVT.getVectorNumElements();
unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
if ((DestNumElts % SrcNumElts) == 0) {
unsigned DestSrcRatio = DestNumElts / SrcNumElts;
if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
SrcVT.getScalarType(), NewExtNumElts);
if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
SDLoc DL(N);
SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
SrcOp, NewIndex);
return DAG.getBitcast(VT, NewExtract);
}
}
}
}
// If we're extracting from a broadcast then we're better off just
// broadcasting to the smaller type directly, assuming this is the only use.
// As it's a broadcast, we don't care about the extraction index.
if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTUDQ2PD(v4i32).
if (InOpcode == ISD::UINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTPS2PD(v4f32).
if (InOpcode == ISD::FP_EXTEND &&
InVec.getOperand(0).getValueType() == MVT::v4f32) {
return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
}
}
if ((InOpcode == ISD::ANY_EXTEND ||
InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
InOpcode == ISD::ZERO_EXTEND ||
InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
VT.is128BitVector() &&
InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
}
if (InOpcode == ISD::VSELECT &&
InVec.getOperand(0).getValueType().is256BitVector() &&
InVec.getOperand(1).getValueType().is256BitVector() &&
InVec.getOperand(2).getValueType().is256BitVector()) {
SDLoc DL(N);
SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
}
return SDValue();
}
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
// If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
// This occurs frequently in our masked scalar intrinsic code and our
// floating point select lowering with AVX512.
// TODO: SimplifyDemandedBits instead?
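// Illustrative: (v1i1 scalar_to_vector (and X, 1)) --> (v1i1 scalar_to_vector X),
// since only bit 0 of X is observable through the v1i1 result.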
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->getAPIntValue().isOneValue())
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
Src.getOperand(0));
// Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->isNullValue())
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
Src.getOperand(1));
// Reduce v2i64 to v4i32 if we don't need the upper bits.
// TODO: Move to DAGCombine?
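// Illustrative: (v2i64 scalar_to_vector (i64 any_extend X:i32))
//   --> bitcast (v4i32 scalar_to_vector (i32 X)),
// avoiding materialization of the undefined upper 32 bits.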
if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
Src.getOperand(0).getScalarValueSizeInBits() <= 32)
return DAG.getBitcast(
VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
return SDValue();
}
// Simplify PMULDQ and PMULUDQ operations.
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// Canonicalize constant to RHS.
if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
!DAG.isConstantIntBuildVectorOrConstantInt(RHS))
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
// Multiply by zero.
+ // Don't return RHS as it may contain UNDEFs.
if (ISD::isBuildVectorAllZeros(RHS.getNode()))
- return RHS;
+ return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
// Aggressively peek through ops to get at the demanded low bits.
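// Illustrative: an operand whose upper 32 bits per element are irrelevant
// (e.g. produced by an 'and' with 0xffffffff) may be simplified here, since
// only the low 32 bits of each element feed the multiply anyway.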
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask);
SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask);
if (DemandedLHS || DemandedRHS)
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
DemandedLHS ? DemandedLHS : LHS,
DemandedRHS ? DemandedRHS : RHS);
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
return SDValue(N, 0);
return SDValue();
}
static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Try to merge vector loads and extend_inreg to an extload.
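// Illustrative: (v4i32 zero_extend_vector_inreg (v16i8 load p)) can become a
// zextload from p with memory type v4i8, when that extload is legal.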
if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
In.hasOneUse()) {
auto *Ld = cast<LoadSDNode>(In);
if (!Ld->isVolatile()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
VT.getVectorNumElements());
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
}
}
}
// Disabling for widening legalization for now. We can enable if we find a
// case that needs it. Otherwise it can be deleted when we switch to
// widening legalization.
if (ExperimentalVectorWideningLegalization)
return SDValue();
// Combine (ext_invec (ext_invec X)) -> (ext_invec X)
if (In.getOpcode() == N->getOpcode() &&
TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
// Attempt to combine as a shuffle.
// TODO: SSE41 support
if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
return SDValue();
}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
case ISD::SCALAR_TO_VECTOR:
return combineScalarToVector(N, DAG);
case ISD::EXTRACT_VECTOR_ELT:
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::CONCAT_VECTORS:
return combineConcatVectors(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
return combineExtractSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case X86ISD::CMP: return combineCMP(N, DAG);
case ISD::ADD: return combineAdd(N, DAG, Subtarget);
case ISD::SUB: return combineSub(N, DAG, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL: return combineShiftLeft(N, DAG);
case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
case X86ISD::CVTTP2SI:
case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::PACKSS:
case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
case X86ISD::VSHL:
case X86ISD::VSRA:
case X86ISD::VSRL:
return combineVectorShiftVar(N, DAG, DCI, Subtarget);
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
case X86ISD::MOVLHPS:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::VBROADCAST:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
case X86ISD::FNMSUB_RND:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
case X86ISD::MGATHER:
case X86ISD::MSCATTER:
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
case X86ISD::PMULDQ:
- case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
+ case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
}
return SDValue();
}
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
// There are no vXi8 shifts.
if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
return false;
// TODO: Almost no 8-bit ops are desirable because they have no actual
// size/speed advantages vs. 32-bit ops, but they do have a major
// potential disadvantage by causing partial register stalls.
//
// 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
// we have specializations to turn 32-bit multiply/shl into LEA or other ops.
// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
// check for a constant operand to the multiply.
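// Illustrative: 'mul i8 %x, 5' is reported as undesirable here, so it gets
// promoted and can then be lowered at i32 with an LEA-based sequence.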
if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
return false;
// i16 instruction encodings are longer and some i16 instructions are slow,
// so those are not desirable.
if (VT == MVT::i16) {
switch (Opc) {
default:
break;
case ISD::LOAD:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return false;
}
}
// Any legal type not explicitly accounted for above is desirable.
return true;
}
SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
SDValue Value, SDValue Addr,
SelectionDAG &DAG) const {
const Module *M = DAG.getMachineFunction().getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
if (IsCFProtectionSupported) {
// In case control-flow branch protection is enabled, we need to add
// notrack prefix to the indirect branch.
// In order to do that we create NT_BRIND SDNode.
// Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
}
return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
}
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
EVT VT = Op.getValueType();
bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
isa<ConstantSDNode>(Op.getOperand(1));
// i16 is legal, but undesirable since i16 instruction encodings are longer
// and some i16 instructions are slow.
// 8-bit multiply-by-constant can usually be expanded to something cheaper
// using LEA and/or other ALU ops.
if (VT != MVT::i16 && !Is8BitMulByConstant)
return false;
auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (!ISD::isNormalStore(User))
return false;
auto *Ld = cast<LoadSDNode>(Load);
auto *St = cast<StoreSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
};
auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
return false;
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (User->getOpcode() != ISD::ATOMIC_STORE)
return false;
auto *Ld = cast<AtomicSDNode>(Load);
auto *St = cast<AtomicSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
};
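// Illustrative: if the i16 op feeds a store back to the same address, e.g.
// (store (add (load p), x), p), keeping it at i16 lets isel fold it into a
// read-modify-write memory instruction, so promotion is rejected below.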
bool Commute = false;
switch (Op.getOpcode()) {
default: return false;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
return false;
break;
}
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
Commute = true;
LLVM_FALLTHROUGH;
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// Avoid disabling potential load folding opportunities.
if (MayFoldLoad(N1) &&
(!Commute || !isa<ConstantSDNode>(N0) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
if (MayFoldLoad(N0) &&
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
if (IsFoldableAtomicRMW(N0, Op) ||
(Commute && IsFoldableAtomicRMW(N1, Op)))
return false;
}
}
PVT = MVT::i32;
return true;
}
bool X86TargetLowering::
isDesirableToCombineBuildVectorToShuffleTruncate(
ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
"Element count mismatch");
assert(
Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
"Shuffle Mask expected to be legal");
// For 32-bit elements VPERMD is better than shuffle+truncate.
// TODO: After we improve lowerBuildVector, add an exception for VPERMW.
if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
return false;
if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
return false;
return true;
}
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Helper to match a string separated by whitespace.
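// Illustrative: matchAsm("  bswapl $0", {"bswapl", "$0"}) returns true, while
// trailing text or a missing piece makes it return false.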
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
for (StringRef Piece : Pieces) {
if (!S.startswith(Piece)) // Check if the piece matches.
return false;
S = S.substr(Piece.size());
StringRef::size_type Pos = S.find_first_not_of(" \t");
if (Pos == 0) // We matched a prefix.
return false;
S = S.substr(Pos);
}
return S.empty();
}
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
if (AsmPieces.size() == 3)
return true;
else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
return true;
}
}
return false;
}
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
const std::string &AsmStr = IA->getAsmString();
IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
// TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
switch (AsmPieces.size()) {
default: return false;
case 1:
// FIXME: this should verify that we are targeting a 486 or better. If not,
// we will turn this bswap into something that will be lowered to logical
// ops instead of emitting the bswap asm. For now, we don't support 486 or
// lower so don't worry about this.
// bswap $0
if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
return IntrinsicLowering::LowerToByteSwap(CI);
}
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
(matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
AsmPieces.clear();
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
break;
case 3:
if (CI->getType()->isIntegerTy(32) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
AsmPieces.clear();
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
if (CI->getType()->isIntegerTy(64)) {
InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
if (Constraints.size() >= 2 &&
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
return IntrinsicLowering::LowerToByteSwap(CI);
}
}
break;
}
return false;
}
static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
.Case("{@cca}", X86::COND_A)
.Case("{@ccae}", X86::COND_AE)
.Case("{@ccb}", X86::COND_B)
.Case("{@ccbe}", X86::COND_BE)
.Case("{@ccc}", X86::COND_B)
.Case("{@cce}", X86::COND_E)
.Case("{@ccz}", X86::COND_E)
.Case("{@ccg}", X86::COND_G)
.Case("{@ccge}", X86::COND_GE)
.Case("{@ccl}", X86::COND_L)
.Case("{@ccle}", X86::COND_LE)
.Case("{@ccna}", X86::COND_BE)
.Case("{@ccnae}", X86::COND_B)
.Case("{@ccnb}", X86::COND_AE)
.Case("{@ccnbe}", X86::COND_A)
.Case("{@ccnc}", X86::COND_AE)
.Case("{@ccne}", X86::COND_NE)
.Case("{@ccnz}", X86::COND_NE)
.Case("{@ccng}", X86::COND_LE)
.Case("{@ccnge}", X86::COND_L)
.Case("{@ccnl}", X86::COND_GE)
.Case("{@ccnle}", X86::COND_G)
.Case("{@ccno}", X86::COND_NO)
.Case("{@ccnp}", X86::COND_P)
.Case("{@ccns}", X86::COND_NS)
.Case("{@cco}", X86::COND_O)
.Case("{@ccp}", X86::COND_P)
.Case("{@ccs}", X86::COND_S)
.Default(X86::COND_INVALID);
return Cond;
}
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'R':
case 'q':
case 'Q':
case 'f':
case 't':
case 'u':
case 'y':
case 'x':
case 'v':
case 'Y':
case 'l':
case 'k': // AVX512 masking registers.
return C_RegisterClass;
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
return C_Register;
case 'I':
case 'J':
case 'K':
case 'N':
case 'G':
case 'L':
case 'M':
return C_Immediate;
case 'C':
case 'e':
case 'Z':
return C_Other;
default:
break;
}
}
else if (Constraint.size() == 2) {
switch (Constraint[0]) {
default:
break;
case 'Y':
switch (Constraint[1]) {
default:
break;
case 'z':
case '0':
return C_Register;
case 'i':
case 'm':
case 'k':
case 't':
case '2':
return C_RegisterClass;
}
}
} else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return C_Other;
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
LLVM_FALLTHROUGH;
case 'R':
case 'q':
case 'Q':
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
if (CallOperandVal->getType()->isIntegerTy())
weight = CW_SpecificReg;
break;
case 'f':
case 't':
case 'u':
if (type->isFloatingPointTy())
weight = CW_SpecificReg;
break;
case 'y':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
case 'Y': {
unsigned Size = StringRef(constraint).size();
// Pick 'i' as the next char, since 'Yi' and 'Y' are synonymous when matching a plain 'Y'.
char NextChar = Size == 2 ? constraint[1] : 'i';
if (Size > 2)
break;
switch (NextChar) {
default:
return CW_Invalid;
// XMM0
case 'z':
case '0':
if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
case 'k':
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
return CW_Register;
return CW_Invalid;
// Any MMX reg
case 'm':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
return weight;
return CW_Invalid;
// Any SSE reg when ISA >= SSE2, same as 'Y'
case 'i':
case 't':
case '2':
if (!Subtarget.hasSSE2())
return CW_Invalid;
break;
}
// Fall through (handle "Y" constraint).
LLVM_FALLTHROUGH;
}
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
LLVM_FALLTHROUGH;
case 'x':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
weight = CW_Register;
break;
case 'k':
// Enable conditional vector operations using %k<#> registers.
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
weight = CW_Register;
break;
case 'I':
if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
if (C->getZExtValue() <= 31)
weight = CW_Constant;
}
break;
case 'J':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 63)
weight = CW_Constant;
}
break;
case 'K':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
weight = CW_Constant;
}
break;
case 'L':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
weight = CW_Constant;
}
break;
case 'M':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 3)
weight = CW_Constant;
}
break;
case 'N':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xff)
weight = CW_Constant;
}
break;
case 'G':
case 'C':
if (isa<ConstantFP>(CallOperandVal)) {
weight = CW_Constant;
}
break;
case 'e':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80000000LL) &&
(C->getSExtValue() <= 0x7fffffffLL))
weight = CW_Constant;
}
break;
case 'Z':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xffffffff)
weight = CW_Constant;
}
break;
}
return weight;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
if (Subtarget.hasSSE2())
return "Y";
if (Subtarget.hasSSE1())
return "x";
}
return TargetLowering::LowerXConstraint(ConstraintVT);
}
// Lower @cc targets via setcc.
SDValue X86TargetLowering::LowerAsmOutputForConstraint(
SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
SelectionDAG &DAG) const {
X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
if (Cond == X86::COND_INVALID)
return SDValue();
// Check that return type is valid.
if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
OpInfo.ConstraintVT.getSizeInBits() < 8)
report_fatal_error("Flag output operand is of invalid type");
// Get EFLAGS register. Only update chain when copyfrom is glued.
if (Flag.getNode()) {
Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
Chain = Flag.getValue(1);
} else
Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
// Extract CC code.
SDValue CC = getSETCC(Cond, Flag, DL, DAG);
// Extend to 32-bits
SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
return Result;
}
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default: break;
case 'I':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'J':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 63) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'L':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'M':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 3) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'O':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 127) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
}
return;
}
case 'Z': {
// 32-bit unsigned value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getZExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
return;
}
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
BooleanContent BCont = getBooleanContents(MVT::i64);
ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
: ISD::SIGN_EXTEND;
int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
: CST->getSExtValue();
Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
break;
}
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
// be used as immediates.
if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
return;
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
if (isGlobalStubReference(
Subtarget.classifyGlobalReference(GA->getGlobal())))
return;
break;
}
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variant.
static bool isGRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::GR8RegClass) ||
RC.hasSuperClassEq(&X86::GR16RegClass) ||
RC.hasSuperClassEq(&X86::GR32RegClass) ||
RC.hasSuperClassEq(&X86::GR64RegClass) ||
RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variant.
static bool isFRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
RC.hasSuperClassEq(&X86::FR64XRegClass) ||
RC.hasSuperClassEq(&X86::VR128XRegClass) ||
RC.hasSuperClassEq(&X86::VR256XRegClass) ||
RC.hasSuperClassEq(&X86::VR512RegClass);
}
/// Check if \p RC is a mask register class.
/// I.e., VK* or one of their variant.
static bool isVKClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::VK1RegClass) ||
RC.hasSuperClassEq(&X86::VK2RegClass) ||
RC.hasSuperClassEq(&X86::VK4RegClass) ||
RC.hasSuperClassEq(&X86::VK8RegClass) ||
RC.hasSuperClassEq(&X86::VK16RegClass) ||
RC.hasSuperClassEq(&X86::VK32RegClass) ||
RC.hasSuperClassEq(&X86::VK64RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
// 'A' means [ER]AX + [ER]DX.
case 'A':
if (Subtarget.is64Bit())
return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
"Expecting 64, 32 or 16 bit subtarget");
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'k':
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1RegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16RegClass);
}
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32RegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64RegClass);
}
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i64 || VT == MVT::f64)
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32_ABCDRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget.hasSSE2()) break;
LLVM_FALLTHROUGH;
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
bool VConstraint = (Constraint[0] == 'v');
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
// Vector types.
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
if (!Subtarget.hasAVX512()) break;
if (VConstraint)
return std::make_pair(0U, &X86::VR512RegClass);
return std::make_pair(0U, &X86::VR512_0_15RegClass);
}
break;
}
} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
switch (Constraint[1]) {
default:
break;
case 'i':
case 't':
case '2':
return getRegForInlineAsmConstraint(TRI, "Y", VT);
case 'm':
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'z':
case '0':
if (!Subtarget.hasSSE1()) break;
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
case 'k':
// This register class doesn't allocate k0 for masked vector operations.
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1WMRegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8WMRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16WMRegClass);
}
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32WMRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64WMRegClass);
}
break;
}
}
if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return std::make_pair(0U, &X86::GR32RegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
Constraint[3] == '(' &&
(Constraint[4] >= '0' && Constraint[4] <= '7') &&
Constraint[5] == ')' && Constraint[6] == '}') {
// st(7) is not allocatable and thus not a member of RFP80. Return
// singleton class in cases where we have a reference to it.
if (Constraint[4] == '7')
return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
return std::make_pair(X86::FP0 + Constraint[4] - '0',
&X86::RFP80RegClass);
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint))
return std::make_pair(X86::FP0, &X86::RFP80RegClass);
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
// dirflag -> DF
if (StringRef("{dirflag}").equals_lower(Constraint))
return std::make_pair(X86::DF, &X86::DFCCRRegClass);
// fpsr -> FPSW
if (StringRef("{fpsr}").equals_lower(Constraint))
return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
return Res;
}
// Make sure it isn't a register that requires 64-bit mode.
if (!Subtarget.is64Bit() &&
(isFRClass(*Res.second) || isGRClass(*Res.second)) &&
TRI->getEncodingValue(Res.first) >= 8) {
// Register requires REX prefix, but we're in 32-bit mode.
return std::make_pair(0, nullptr);
}
// Make sure it isn't a register that requires AVX512.
if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
TRI->getEncodingValue(Res.first) & 0x10) {
// Register requires EVEX prefix.
return std::make_pair(0, nullptr);
}
// Otherwise, check to see if this is a register class of the wrong value
// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
return Res; // Correct type already, nothing to do.
// Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
// return "eax". This should even work for things like getting 64-bit integer
// registers when given an f64 type.
const TargetRegisterClass *Class = Res.second;
// The generic code will match the first register class that contains the
// given register. Thus, based on the ordering of the tablegened file,
// the "plain" GR classes might not come first.
// Therefore, use a helper method.
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
const TargetRegisterClass *RC =
Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
: Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
: nullptr;
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
switch (DestReg) {
case X86::RAX:
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
case X86::RDX:
return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
case X86::RCX:
return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
case X86::RBX:
return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
case X86::RSI:
return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
case X86::RDI:
return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
case X86::RBP:
return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
default:
return std::make_pair(0, nullptr);
}
}
if (RC && RC->contains(DestReg))
return std::make_pair(DestReg, RC);
return Res;
}
// No register found/type mismatch.
return std::make_pair(0, nullptr);
} else if (isFRClass(*Class)) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
Res.second = &X86::VR128XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
Res.second = &X86::VR256XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
// Type mismatch and not a clobber: Return an error;
Res.first = 0;
Res.second = nullptr;
}
} else if (isVKClass(*Class)) {
if (VT == MVT::i1)
Res.second = &X86::VK1RegClass;
else if (VT == MVT::i8)
Res.second = &X86::VK8RegClass;
else if (VT == MVT::i16)
Res.second = &X86::VK16RegClass;
else if (VT == MVT::i32)
Res.second = &X86::VK32RegClass;
else if (VT == MVT::i64)
Res.second = &X86::VK64RegClass;
else {
// Type mismatch and not a clobber: Return an error;
Res.first = 0;
Res.second = nullptr;
}
}
return Res;
}
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// An indexed folded instruction, i.e., inst (reg1, reg2, scale),
// will take 2 allocations in the out of order engine instead of 1
// for plain addressing mode, i.e. inst (reg1).
// E.g.,
// vaddps (%rsi,%rdx), %ymm0, %ymm1
// Requires two allocations (one for the load, one for the computation)
// whereas:
// vaddps (%rsi), %ymm0, %ymm1
// Requires just 1 allocation, i.e., freeing allocations for other operations
// and having less micro operations to execute.
//
// For some X86 architectures, this is even worse because for instance for
// stores, the complex addressing mode forces the instruction to use the
// "load" ports instead of the dedicated "store" port.
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.
return AM.Scale != 0;
return -1;
}
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
// The exception to this is vector division. Since x86 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize =
Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
if (!Subtarget.is64Bit())
return;
// Update IsSplitCSR in X86MachineFunctionInfo.
X86MachineFunctionInfo *AFI =
Entry->getParent()->getInfo<X86MachineFunctionInfo>();
AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (X86::GR64RegClass.contains(*I))
RC = &X86::GR64RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
unsigned NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
assert(
Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
// Generally, if we aren't on Windows, the platform ABI does not include
// support for stack probes, so don't emit them.
if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
return "";
// We need a stack probe to conform to the Windows ABI. Choose the right
// symbol.
if (Subtarget.is64Bit())
return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
Index: projects/clang900-import/contrib/llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/X86/X86ISelLowering.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/X86/X86ISelLowering.h (revision 351722)
@@ -1,1651 +1,1647 @@
//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
namespace llvm {
class X86Subtarget;
class X86TargetMachine;
namespace X86ISD {
// X86 Specific DAG Nodes
enum NodeType : unsigned {
// Start the numbering where the builtin ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END,
/// Bit scan forward.
BSF,
/// Bit scan reverse.
BSR,
/// Double shift instructions. These correspond to
/// X86::SHLDxx and X86::SHRDxx instructions.
SHLD,
SHRD,
/// Bitwise logical AND of floating point values. This corresponds
/// to X86::ANDPS or X86::ANDPD.
FAND,
/// Bitwise logical OR of floating point values. This corresponds
/// to X86::ORPS or X86::ORPD.
FOR,
/// Bitwise logical XOR of floating point values. This corresponds
/// to X86::XORPS or X86::XORPD.
FXOR,
/// Bitwise logical ANDNOT of floating point values. This
/// corresponds to X86::ANDNPS or X86::ANDNPD.
FANDN,
/// These operations represent an abstract X86 call
/// instruction, which includes a bunch of information. In particular the
/// operands of these node are:
///
/// #0 - The incoming token chain
/// #1 - The callee
/// #2 - The number of arg bytes the caller pushes on the stack.
/// #3 - The number of arg bytes the callee pops off the stack.
/// #4 - The value to pass in AL/AX/EAX (optional)
/// #5 - The value to pass in DL/DX/EDX (optional)
///
/// The result values of these nodes are:
///
/// #0 - The outgoing token chain
/// #1 - The first register result value (optional)
/// #2 - The second register result value (optional)
///
CALL,
/// Same as call except it adds the NoTrack prefix.
NT_CALL,
/// X86 compare and logical compare instructions.
CMP, COMI, UCOMI,
/// X86 bit-test instructions.
BT,
/// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
/// operand, usually produced by a CMP instruction.
SETCC,
/// X86 Select
SELECTS,
// Same as SETCC except it's materialized with an sbb and the value is all
// ones or all zeros.
SETCC_CARRY, // R = carry_bit ? ~0 : 0
/// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
/// Operands are two FP values to compare; result is a mask of
/// 0s or 1s. Generally DTRT for C/C++ with NaNs.
FSETCC,
/// X86 FP SETCC, similar to above, but with output as an i1 mask and
/// a version with SAE.
FSETCCM, FSETCCM_SAE,
/// X86 conditional moves. Operand 0 and operand 1 are the two values
/// to select from. Operand 2 is the condition code, and operand 3 is the
/// flag operand produced by a CMP or TEST instruction.
CMOV,
/// X86 conditional branches. Operand 0 is the chain operand, operand 1
/// is the block to branch if condition is true, operand 2 is the
/// condition code, and operand 3 is the flag operand produced by a CMP
/// or TEST instruction.
BRCOND,
/// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
/// operand 1 is the target address.
NT_BRIND,
/// Return with a flag operand. Operand 0 is the chain operand, operand
/// 1 is the number of bytes of stack to pop.
RET_FLAG,
/// Return from interrupt. Operand 0 is the number of bytes to pop.
IRET,
/// Repeat fill, corresponds to X86::REP_STOSx.
REP_STOS,
/// Repeat move, corresponds to X86::REP_MOVSx.
REP_MOVS,
/// On Darwin, this node represents the result of the popl
/// at function entry, used for PIC code.
GlobalBaseReg,
/// A wrapper node for TargetConstantPool, TargetJumpTable,
/// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
/// MCSymbol and TargetBlockAddress.
Wrapper,
/// Special wrapper used under X86-64 PIC mode for RIP
/// relative displacements.
WrapperRIP,
/// Copies a 64-bit value from the low word of an XMM vector
/// to an MMX vector.
MOVDQ2Q,
/// Copies a 32-bit value from the low word of an MMX
/// vector to a GPR.
MMX_MOVD2W,
/// Copies a GPR into the low 32-bit word of an MMX vector
/// and zeroes out the high word.
MMX_MOVW2D,
/// Extract an 8-bit value from a vector and zero extend it to
/// i32, corresponds to X86::PEXTRB.
PEXTRB,
/// Extract a 16-bit value from a vector and zero extend it to
/// i32, corresponds to X86::PEXTRW.
PEXTRW,
/// Insert any element of a 4 x float vector into any element
/// of a destination 4 x float vector.
INSERTPS,
/// Insert the lower 8-bits of a 32-bit value to a vector,
/// corresponds to X86::PINSRB.
PINSRB,
/// Insert the lower 16-bits of a 32-bit value to a vector,
/// corresponds to X86::PINSRW.
PINSRW,
/// Shuffle 16 8-bit values within a vector.
PSHUFB,
/// Compute Sum of Absolute Differences.
PSADBW,
/// Compute Double Block Packed Sum-Absolute-Differences
DBPSADBW,
/// Bitwise Logical AND NOT of Packed FP values.
ANDNP,
/// Blend where the selector is an immediate.
BLENDI,
/// Dynamic (non-constant condition) vector blend where only the sign bits
/// of the condition elements are used. This is used to enforce that the
/// condition mask is not valid for generic VSELECT optimizations. This
/// is also used to implement the intrinsics.
/// Operands are in VSELECT order: MASK, TRUE, FALSE
BLENDV,
/// Combined add and sub on an FP vector.
ADDSUB,
// FP vector ops with rounding mode.
FADD_RND, FADDS, FADDS_RND,
FSUB_RND, FSUBS, FSUBS_RND,
FMUL_RND, FMULS, FMULS_RND,
FDIV_RND, FDIVS, FDIVS_RND,
FMAX_SAE, FMAXS_SAE,
FMIN_SAE, FMINS_SAE,
FSQRT_RND, FSQRTS, FSQRTS_RND,
// FP vector get exponent.
FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE,
// Extract Normalized Mantissas.
VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE,
// FP Scale.
SCALEF, SCALEF_RND,
SCALEFS, SCALEFS_RND,
// Unsigned Integer average.
AVG,
/// Integer horizontal add/sub.
HADD,
HSUB,
/// Floating point horizontal add/sub.
FHADD,
FHSUB,
// Detect Conflicts Within a Vector
CONFLICT,
/// Floating point max and min.
FMAX, FMIN,
/// Commutative FMIN and FMAX.
FMAXC, FMINC,
/// Scalar intrinsic floating point max and min.
FMAXS, FMINS,
/// Floating point reciprocal-sqrt and reciprocal approximation.
/// Note that these typically require refinement
/// in order to obtain suitable precision.
FRSQRT, FRCP,
// AVX-512 reciprocal approximations with a little more precision.
RSQRT14, RSQRT14S, RCP14, RCP14S,
// Thread Local Storage.
TLSADDR,
// Thread Local Storage. A call to get the start address
// of the TLS block for the current module.
TLSBASEADDR,
// Thread Local Storage. When calling an OS-provided
// thunk at the address from an earlier relocation.
TLSCALL,
// Exception Handling helpers.
EH_RETURN,
// SjLj exception handling setjmp.
EH_SJLJ_SETJMP,
// SjLj exception handling longjmp.
EH_SJLJ_LONGJMP,
// SjLj exception handling dispatch.
EH_SJLJ_SETUP_DISPATCH,
/// Tail call return. See X86TargetLowering::LowerCall for
/// the list of operands.
TC_RETURN,
// Vector move to low scalar and zero higher vector elements.
VZEXT_MOVL,
// Vector integer truncate.
VTRUNC,
// Vector integer truncate with unsigned/signed saturation.
VTRUNCUS, VTRUNCS,
// Masked version of the above. Used when less than a 128-bit result is
// produced since the mask only applies to the lower elements and can't
// be represented by a select.
// SRC, PASSTHRU, MASK
VMTRUNC, VMTRUNCUS, VMTRUNCS,
// Vector FP extend.
VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE,
// Vector FP round.
VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND,
// Masked version of above. Used for v2f64->v4f32.
// SRC, PASSTHRU, MASK
VMFPROUND,
// 128-bit vector logical left / right shift
VSHLDQ, VSRLDQ,
// Vector shift elements
VSHL, VSRL, VSRA,
// Vector variable shift
VSHLV, VSRLV, VSRAV,
// Vector shift elements by immediate
VSHLI, VSRLI, VSRAI,
// Shifts of mask registers.
KSHIFTL, KSHIFTR,
// Bit rotate by immediate
VROTLI, VROTRI,
// Vector packed double/float comparison.
CMPP,
// Vector integer comparisons.
PCMPEQ, PCMPGT,
// v8i16 Horizontal minimum and position.
PHMINPOS,
MULTISHIFT,
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
// Vector comparison with SAE for FP values
CMPM_SAE,
// Arithmetic operations with FLAGS results.
ADD, SUB, ADC, SBB, SMUL, UMUL,
OR, XOR, AND,
// Bit field extract.
BEXTR,
// Zero High Bits Starting with Specified Bit Position.
BZHI,
// X86-specific multiply by immediate.
MUL_IMM,
// Vector sign bit extraction.
MOVMSK,
// Vector bitwise comparisons.
PTEST,
// Vector packed fp sign bitwise comparisons.
TESTP,
// OR/AND test for masks.
KORTEST,
KTEST,
// ADD for masks.
KADD,
// Several flavors of instructions with vector shuffle behaviors.
// Saturated signed/unsigned packing.
PACKSS,
PACKUS,
// Intra-lane alignr.
PALIGNR,
// AVX512 inter-lane alignr.
VALIGN,
PSHUFD,
PSHUFHW,
PSHUFLW,
SHUFP,
// VBMI2 Concat & Shift.
VSHLD,
VSHRD,
VSHLDV,
VSHRDV,
// Shuffle Packed Values at 128-bit granularity.
SHUF128,
MOVDDUP,
MOVSHDUP,
MOVSLDUP,
MOVLHPS,
MOVHLPS,
MOVSD,
MOVSS,
UNPCKL,
UNPCKH,
VPERMILPV,
VPERMILPI,
VPERMI,
VPERM2X128,
// Variable Permute (VPERM).
// Res = VPERMV MaskV, V0
VPERMV,
// 3-op Variable Permute (VPERMT2).
// Res = VPERMV3 V0, MaskV, V1
VPERMV3,
// Bitwise ternary logic.
VPTERNLOG,
// Fix Up Special Packed Float32/64 values.
VFIXUPIMM, VFIXUPIMM_SAE,
VFIXUPIMMS, VFIXUPIMMS_SAE,
// Range Restriction Calculation For Packed Pairs of Float32/64 values.
VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE,
// Reduce - Perform Reduction Transformation on scalar/packed FP.
VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE,
// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
// Also used by the legacy (V)ROUND intrinsics where we mask out the
// scaling part of the immediate.
VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE,
// Tests the types of packed FP values.
VFPCLASS,
// Tests the types of scalar FP values.
VFPCLASSS,
// Broadcast scalar to vector.
VBROADCAST,
// Broadcast mask to vector.
VBROADCASTM,
// Broadcast subvector to vector.
SUBV_BROADCAST,
/// SSE4A Extraction and Insertion.
EXTRQI, INSERTQI,
// XOP arithmetic/logical shifts.
VPSHA, VPSHL,
// XOP signed/unsigned integer comparisons.
VPCOM, VPCOMU,
// XOP packed permute bytes.
VPPERM,
// XOP two source permutation.
VPERMIL2,
// Vector multiply packed unsigned doubleword integers.
PMULUDQ,
// Vector multiply packed signed doubleword integers.
PMULDQ,
// Vector Multiply Packed Unsigned Integers with Round and Scale.
MULHRS,
// Multiply and Add Packed Integers.
VPMADDUBSW, VPMADDWD,
// AVX512IFMA multiply and add.
// NOTE: These are different from the instruction and perform
// op0 x op1 + op2.
VPMADD52L, VPMADD52H,
// VNNI
VPDPBUSD,
VPDPBUSDS,
VPDPWSSD,
VPDPWSSDS,
// FMA nodes.
// We use the target independent ISD::FMA for the non-inverted case.
FNMADD,
FMSUB,
FNMSUB,
FMADDSUB,
FMSUBADD,
// FMA with rounding mode.
FMADD_RND,
FNMADD_RND,
FMSUB_RND,
FNMSUB_RND,
FMADDSUB_RND,
FMSUBADD_RND,
// Compress and expand.
COMPRESS,
EXPAND,
// Bits shuffle
VPSHUFBITQMB,
// Convert Signed/Unsigned Integer to Floating-Point Value with rounding mode.
SINT_TO_FP_RND, UINT_TO_FP_RND,
SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP,
SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
// Vector float/double to signed/unsigned integer.
CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
// Scalar float/double to signed/unsigned integer.
CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND,
// Vector float/double to signed/unsigned integer with truncation.
CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE,
// Scalar float/double to signed/unsigned integer with truncation.
CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE,
// Vector signed/unsigned integer to float/double.
CVTSI2P, CVTUI2P,
// Masked versions of above. Used for v2f64->v4f32.
// SRC, PASSTHRU, MASK
MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
MCVTSI2P, MCVTUI2P,
// Vector float to bfloat16.
// Convert TWO packed single data to one packed BF16 data
CVTNE2PS2BF16,
// Convert packed single data to packed BF16 data
CVTNEPS2BF16,
// Masked version of above.
// SRC, PASSTHRU, MASK
MCVTNEPS2BF16,
// Dot product of BF16 pairs, accumulated into
// packed single precision.
DPBF16PS,
// Save xmm argument registers to the stack, according to %al. An operator
// is needed so that this can be expanded with control flow.
VASTART_SAVE_XMM_REGS,
// Windows's _chkstk call to do stack probing.
WIN_ALLOCA,
// For allocating variable amounts of stack space when using
// segmented stacks. Checks whether the current stacklet has enough space,
// and falls back to heap allocation if not.
SEG_ALLOCA,
// Memory barriers.
MEMBARRIER,
MFENCE,
// Store FP status word into i16 register.
FNSTSW16r,
// Store contents of %ah into %eflags.
SAHF,
// Get a random integer and indicate whether it is valid in CF.
RDRAND,
// Get a NIST SP800-90B & C compliant random integer and
// indicate whether it is valid in CF.
RDSEED,
// Protection keys
// RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
// WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
// value for ECX.
RDPKRU, WRPKRU,
// SSE42 string comparisons.
// These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG
// will emit one or two instructions based on which results are used. If
// both flags and index/mask are needed, this allows us to use a single
// instruction since we won't have to pick an opcode for flags. Instead we
// can rely on the DAG to CSE everything and decide at isel.
PCMPISTR,
PCMPESTR,
// Test if in transactional execution.
XTEST,
// ERI instructions.
RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE,
RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE,
// Conversions between float and half-float.
CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE,
// Masked version of above.
// SRC, RND, PASSTHRU, MASK
MCVTPS2PH,
// Galois Field Arithmetic Instructions
GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
// LWP insert record.
LWPINS,
// User level wait
UMWAIT, TPAUSE,
// Enqueue Stores Instructions
ENQCMD, ENQCMDS,
// For avx512-vp2intersect
VP2INTERSECT,
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
LCMPXCHG16_DAG,
LCMPXCHG8_SAVE_EBX_DAG,
LCMPXCHG16_SAVE_RBX_DAG,
/// LOCK-prefixed arithmetic read-modify-write instructions.
/// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
LADD, LSUB, LOR, LXOR, LAND,
// Load, scalar_to_vector, and zero extend.
VZEXT_LOAD,
// extract_vector_elt, store.
VEXTRACT_STORE,
// Store FP control world into i16 memory.
FNSTCW16m,
/// This instruction implements FP_TO_SINT with the
/// integer destination in memory and a FP reg source. This corresponds
/// to the X86::FIST*m instructions and the rounding mode change stuff. It
/// has two inputs (token chain and address) and two outputs (int value
/// and token chain). Memory VT specifies the type to store to.
FP_TO_INT_IN_MEM,
/// This instruction implements SINT_TO_FP with the
/// integer source in memory and FP reg result. This corresponds to the
/// X86::FILD*m instructions. It has two inputs (token chain and address)
/// and two outputs (FP value and token chain). FILD_FLAG also produces a
/// flag. The integer source type is specified by the memory VT.
FILD,
FILD_FLAG,
/// This instruction implements a fp->int store from FP stack
/// slots. This corresponds to the fist instruction. It takes a
/// chain operand, value to store, address, and glue. The memory VT
/// specifies the type to store as.
FIST,
/// This instruction implements an extending load to FP stack slots.
/// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
/// operand, and ptr to load from. The memory VT specifies the type to
/// load from.
FLD,
/// This instruction implements a truncating store from FP stack
/// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
/// chain operand, value to store, address, and glue. The memory VT
/// specifies the type to store as.
FST,
/// This instruction grabs the address of the next argument
/// from a va_list. (reads and modifies the va_list in memory)
VAARG_64,
// Vector truncating store with unsigned/signed saturation
VTRUNCSTOREUS, VTRUNCSTORES,
// Vector truncating masked store with unsigned/signed saturation
VMTRUNCSTOREUS, VMTRUNCSTORES,
// X86 specific gather and scatter
MGATHER, MSCATTER,
// WARNING: Do not add anything at the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
// opcodes will be treated as target memory ops!
};
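// A minimal illustrative sketch (not a quote from the lowering code): these
// opcodes are produced by the lowering hooks in X86ISelLowering.cpp through
// SelectionDAG, with operands in the order documented above. For example, a
// conditional move could be built roughly as:
//   // LHS/RHS are the values to select from, CC an i8 condition-code
//   // constant, and EFLAGS the flag value produced by an X86ISD::CMP.
//   SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, LHS, RHS, CC, EFLAGS);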
} // end namespace X86ISD
/// Define some predicates that are used for node matching.
namespace X86 {
/// Returns true if Elt is a constant zero or floating point constant +0.0.
bool isZeroNode(SDValue Elt);
/// Returns true if the given offset can
/// fit into the displacement field of the instruction.
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement = true);
/// Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
} // end namespace X86
//===--------------------------------------------------------------------===//
// X86 Implementation of the TargetLowering interface
class X86TargetLowering final : public TargetLowering {
public:
explicit X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI);
unsigned getJumpTableEncoding() const override;
bool useSoftFloat() const override;
void markLibCallAttributes(MachineFunction *MF, unsigned CC,
ArgListTy &Args) const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
return MVT::i8;
}
const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB, unsigned uid,
MCContext &Ctx) const override;
/// Returns relocation base for the given PIC jumptable.
SDValue getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const override;
const MCExpr *
getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
unsigned JTI, MCContext &Ctx) const override;
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const override;
/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, it means the destination can satisfy any
/// alignment constraint. Similarly, if SrcAlign is zero, it means there is
/// no need to check it against an alignment requirement, probably because
/// the source does not need to be loaded. If 'IsMemset' is
/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const override;
/// Returns true if it's safe to use load / store of the
/// specified type to expand memcpy / memset inline. This is mostly true
/// for all types except for some special cases. For example, on X86
/// targets without SSE2 f64 load / store are done with fldl / fstpl which
/// also does type conversion. Note the specified type doesn't have to be
/// legal as the hook is used before type legalization.
bool isSafeMemOpType(MVT VT) const override;
/// Returns true if the target allows unaligned memory accesses of the
/// specified type. Returns whether it is "fast" in the last argument.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
/// Provide custom lowering hooks for some operations.
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
// Return true if it is profitable to combine a BUILD_VECTOR with a
// stride-pattern to a shuffle and a truncate.
// Example of such a combine:
// v4i32 build_vector((extract_elt V, 1),
// (extract_elt V, 3),
// (extract_elt V, 5),
// (extract_elt V, 7))
// -->
// v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to
// v4i64)
bool isDesirableToCombineBuildVectorToShuffleTruncate(
ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
/// Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
/// Return true if the target has native support for the
/// specified value type and it is 'desirable' to use the type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
/// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
/// Do not merge vector stores after legalization because that may conflict
/// with x86-specific store splitting optimizations.
bool mergeStoresAfterLegalization(EVT MemVT) const override {
return !MemVT.isVector();
}
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const override;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
bool isCtlzFast() const override;
bool hasBitPreservingFPLogic(EVT VT) const override {
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
}
bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
// If the pair to store is a mixture of float and int values, we will
// save two bitwise instructions and one float-to-int instruction and
// add one store instruction. There is potentially a more significant
// benefit because it avoids the float->int domain switch for the input
// value, so it is more likely a win.
if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
(LTy.isInteger() && HTy.isFloatingPoint()))
return true;
// If the pair only contains int values, we will save two bitwise
// instructions and add one store instruction (costing one more store
// buffer). Since the benefit is less clear, we leave such pairs out
// until we have a test case proving it is a win.
return false;
}
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
bool hasAndNotCompare(SDValue Y) const override;
bool hasAndNot(SDValue Y) const override;
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
bool
shouldTransformSignedTruncationCheck(EVT XVT,
unsigned KeptBits) const override {
// For vectors, we don't have a preference.
if (XVT.isVector())
return false;
auto VTIsOk = [](EVT VT) -> bool {
return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
VT == MVT::i64;
};
// We are ok with KeptBitsVT being byte/word/dword, what MOVS supports.
// XVT will be larger than KeptBitsVT.
MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
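// Illustrative behaviour sketch (the values follow directly from the code
// above; the calls are shown free-standing only for readability):
//   shouldTransformSignedTruncationCheck(MVT::i32, /*KeptBits=*/8);  // true
//   shouldTransformSignedTruncationCheck(MVT::i32, /*KeptBits=*/24); // false
// 8 kept bits map to MVT::i8, one of the widths accepted by VTIsOk; 24 bits
// map to no simple integer MVT, so the hook declines the transform.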
- bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
- if (DAG.getMachineFunction().getFunction().hasMinSize())
- return false;
- return true;
- }
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
bool shouldSplatInsEltVarIndex(EVT VT) const override;
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
/// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
MVT hasFastEqualityCompare(unsigned NumBits) const override;
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
TargetLoweringOpt &TLO) const override;
/// Determine which of the bits specified in Mask are known to be either
/// zero or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
/// Determine the number of bits in the operation that are sign bits.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
const APInt &DemandedElts,
APInt &KnownUndef,
APInt &KnownZero,
TargetLoweringOpt &TLO,
unsigned Depth) const override;
bool SimplifyDemandedBitsForTargetNode(SDValue Op,
const APInt &DemandedBits,
const APInt &DemandedElts,
KnownBits &Known,
TargetLoweringOpt &TLO,
unsigned Depth) const override;
const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
SDValue unwrapAddress(SDValue N) const override;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
bool ExpandInlineAsm(CallInst *CI) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
ConstraintWeight
getSingleConstraintMatchWeight(AsmOperandInfo &info,
const char *constraint) const override;
const char *LowerXConstraint(EVT ConstraintVT) const override;
/// Lower the specified operand into the Ops vector. If it is invalid, don't
/// add anything to Ops. If hasMemory is true it means one of the asm
/// constraint of the inline asm instruction being processed is 'm'.
void LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
unsigned
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "i")
return InlineAsm::Constraint_i;
else if (ConstraintCode == "o")
return InlineAsm::Constraint_o;
else if (ConstraintCode == "v")
return InlineAsm::Constraint_v;
else if (ConstraintCode == "X")
return InlineAsm::Constraint_X;
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
/// Handle lowering of flag assembly outputs.
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL,
const AsmOperandInfo &Constraint,
SelectionDAG &DAG) const override;
/// Given a physical register constraint
/// (e.g. {edx}), return the register number and the register class for the
/// register. This should only be used for C_Register constraints. On
/// error, this returns a register number of 0.
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
/// Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
Type *Ty, unsigned AS,
Instruction *I = nullptr) const override;
/// Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
/// compare a register against the immediate without having to materialize
/// the immediate into a register.
bool isLegalICmpImmediate(int64_t Imm) const override;
/// Return true if the specified immediate is legal
/// add immediate, that is the target has add instructions which can
/// add a register and the immediate without having to materialize
/// the immediate into a register.
bool isLegalAddImmediate(int64_t Imm) const override;
bool isLegalStoreImmediate(int64_t Imm) const override;
/// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
bool isVectorShiftByScalarCheap(Type *Ty) const override;
/// Add x86-specific opcodes to the default list.
bool isBinOp(unsigned Opcode) const override;
/// Returns true if the opcode is a commutative binary operation.
bool isCommutativeBinOp(unsigned Opcode) const override;
/// Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
/// register EAX to i16 by referencing its sub-register AX.
bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
bool isTruncateFree(EVT VT1, EVT VT2) const override;
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
/// Return true if any actual instruction that defines a
/// value of type Ty1 implicit zero-extends the value to Ty2 in the result
/// register. This does not necessarily include registers defined in
/// unknown ways, such as incoming arguments, or copies from unknown
/// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
/// does not necessarily apply to truncate instructions. e.g. on x86-64,
/// all instructions that define 32-bit values implicit zero-extend the
/// result out to 64 bits.
bool isZExtFree(Type *Ty1, Type *Ty2) const override;
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
/// Return true if folding a vector load into ExtVal (a sign, zero, or any
/// extend node) is profitable.
bool isVectorLoadExtDesirable(SDValue) const override;
/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this
/// method returns true, otherwise fmuladd is expanded to fmul + fadd.
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
/// Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
/// from i32 to i8 but not from i32 to i16.
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
/// Given an intrinsic, checks if on the target the intrinsic will need to map
/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
/// true and stores the intrinsic information into the IntrinsicInfo that was
/// passed to the function.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
/// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
/// be legal.
bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
/// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
/// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
/// constant pool entry.
bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
/// Returns true if lowering to a jump table is allowed.
bool areJTsAllowed(const Function *Fn) const override;
/// If true, then instruction selection should
/// seek to shrink the FP constant of the specified type to a smaller type
/// in order to save space and / or reduce runtime.
bool ShouldShrinkFPConstant(EVT VT) const override {
// Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
// expensive than a straight movsd. On the other hand, it's important to
// shrink long double fp constant since fldt is very slow.
return !X86ScalarSSEf64 || VT == MVT::f80;
}
/// Return true if we believe it is correct and profitable to reduce the
/// load node to a smaller type.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
EVT NewVT) const override;
/// Return true if the specified scalar FP type is computed in an SSE
/// register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override;
bool convertSelectOfConstantsToMath(EVT VT) const override;
bool decomposeMulByConstant(EVT VT, SDValue C) const override;
bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
bool IsSigned) const override;
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
/// Scalar ops always have equal or better analysis/performance/power than
/// the vector equivalent, so this always makes sense if the scalar op is
/// supported.
bool shouldScalarizeBinop(SDValue) const override;
/// Extract of a scalar FP value from index 0 of a vector is free.
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
EVT EltVT = VT.getScalarType();
return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
}
/// Overflow nodes should get combined/lowered to optimal instructions
/// (they should allow eliminating explicit compares by getting flags from
/// math ops).
bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override;
bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
unsigned AddrSpace) const override {
// If we can replace more than 2 scalar stores, there will be a reduction
// in instructions even after we add a vector constant load.
return NumElem > 2;
}
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
const SelectionDAG &DAG,
const MachineMemOperand &MMO) const override;
/// Intel processors have a unified instruction and data cache.
const char * getClearCacheBuiltinName() const override {
return nullptr; // nothing to do, move along.
}
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
unsigned
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
virtual bool needsFixedCatchObjects() const override;
/// This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const override;
/// If the target has a standard location for the stack protector cookie,
/// returns the address of that location. Otherwise, returns nullptr.
Value *getIRStackGuard(IRBuilder<> &IRB) const override;
bool useLoadStackGuardNode() const override;
bool useStackGuardXorFP() const override;
void insertSSPDeclarations(Module &M) const override;
Value *getSDagStackGuard(const Module &M) const override;
Function *getSSPStackGuardCheck(const Module &M) const override;
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const override;
/// Return true if the target stores SafeStack pointer at a fixed offset in
/// some non-standard address space, and populates the address space and
/// offset as appropriate.
Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
SelectionDAG &DAG) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
/// Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;
unsigned getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
bool hasVectorBlend() const override { return true; }
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
SDValue Addr, SelectionDAG &DAG)
const override;
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const override;
private:
/// Keep a reference to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget &Subtarget;
/// Select between SSE or x87 floating point ops.
/// When SSE is available, use it for f32 operations.
/// When SSE2 is available, use it for f64 operations.
bool X86ScalarSSEf32;
bool X86ScalarSSEf64;
/// A list of legal FP immediates.
std::vector<APFloat> LegalFPImmediates;
/// Indicate that this x86 target can instruction
/// select the specified FP immediate natively.
void addLegalFPImmediate(const APFloat& Imm) {
LegalFPImmediates.push_back(Imm);
}
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const;
SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &ArgInfo,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA, MachineFrameInfo &MFI,
unsigned i) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const;
// Call lowering helpers.
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CalleeCC,
bool isVarArg,
bool isCalleeStructRet,
bool isCallerStructRet,
Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
SDValue Chain, bool IsTailCall,
bool Is64Bit, int FPDiff,
const SDLoc &dl) const;
unsigned GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG &DAG) const;
unsigned getAddressSpace(void) const;
SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
const unsigned char OpFlags = 0) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
bool ForCall) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const override;
bool supportSplitCSR(MachineFunction *MF) const override {
return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
}
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
bool needsCmpXchgNb(Type *MemType) const;
void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB, int FI) const;
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *
EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const;
/// Utility function to emit the xmm reg save portion of va_start.
MachineBasicBlock *
EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
MachineInstr &MI2,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
void emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
MachineBasicBlock *MBB) const;
MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *MBB) const;
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent, for use with the given x86 condition code.
SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG) const;
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
SelectionDAG &DAG,
SDValue &X86CC) const;
/// Check if replacement of SQRT with RSQRT should be disabled.
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
/// Use rsqrt* to speed up sqrt calculations.
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;
/// Use rcp* to speed up fdiv calculations.
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps) const override;
/// Reassociate floating point divisions into multiply by reciprocal.
unsigned combineRepeatedFPDivisors() const override;
};
namespace X86 {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
} // end namespace X86
// Base class for all X86 non-masked store operations.
class X86StoreSDNode : public MemSDNode {
public:
X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
:MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
const SDValue &getValue() const { return getOperand(1); }
const SDValue &getBasePtr() const { return getOperand(2); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VTRUNCSTORES ||
N->getOpcode() == X86ISD::VTRUNCSTOREUS;
}
};
// Base class for all X86 masked store operations.
// The class has the same order of operands as MaskedStoreSDNode for
// convenience.
class X86MaskedStoreSDNode : public MemSDNode {
public:
X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
const DebugLoc &dl, SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
const SDValue &getValue() const { return getOperand(1); }
const SDValue &getBasePtr() const { return getOperand(2); }
const SDValue &getMask() const { return getOperand(3); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
}
};
// X86 Truncating Store with Signed saturation.
class TruncSStoreSDNode : public X86StoreSDNode {
public:
TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
: X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VTRUNCSTORES;
}
};
// X86 Truncating Store with Unsigned saturation.
class TruncUSStoreSDNode : public X86StoreSDNode {
public:
TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
: X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
}
};
// X86 Truncating Masked Store with Signed saturation.
class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
public:
MaskedTruncSStoreSDNode(unsigned Order,
const DebugLoc &dl, SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VMTRUNCSTORES;
}
};
// X86 Truncating Masked Store with Unsigned saturation.
class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
public:
MaskedTruncUSStoreSDNode(unsigned Order,
const DebugLoc &dl, SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
}
};
// X86 specific Gather/Scatter nodes.
// The class has the same order of operands as MaskedGatherScatterSDNode for
// convenience.
class X86MaskedGatherScatterSDNode : public MemSDNode {
public:
X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
const DebugLoc &dl, SDVTList VTs, EVT MemVT,
MachineMemOperand *MMO)
: MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}
const SDValue &getBasePtr() const { return getOperand(3); }
const SDValue &getIndex() const { return getOperand(4); }
const SDValue &getMask() const { return getOperand(2); }
const SDValue &getScale() const { return getOperand(5); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MGATHER ||
N->getOpcode() == X86ISD::MSCATTER;
}
};
class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
public:
X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
EVT MemVT, MachineMemOperand *MMO)
: X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
MMO) {}
const SDValue &getPassThru() const { return getOperand(1); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MGATHER;
}
};
class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
public:
X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
EVT MemVT, MachineMemOperand *MMO)
: X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
MMO) {}
const SDValue &getValue() const { return getOperand(1); }
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MSCATTER;
}
};
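// Illustrative usage sketch (assumes the usual SelectionDAG headers are
// visible and N is an SDNode*): the classof() hooks above plug these nodes
// into LLVM's cast machinery, so combines can match them directly, e.g.:
//   if (auto *Gather = dyn_cast<X86MaskedGatherSDNode>(N)) {
//     SDValue Mask = Gather->getMask();         // operand 2
//     SDValue Index = Gather->getIndex();       // operand 4
//     SDValue PassThru = Gather->getPassThru(); // operand 1
//   }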
/// Generate unpacklo/unpackhi shuffle mask.
template <typename T = int>
void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
bool Unary) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
int Pos = (i % NumEltsInLane) / 2 + LaneStart;
Pos += (Unary ? 0 : NumElts * (i % 2));
Pos += (Lo ? 0 : NumEltsInLane / 2);
Mask.push_back(Pos);
}
}
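// Illustrative example (the values follow from the loop above): for
// MVT::v4i32 this produces the classic unpcklps/unpckhps patterns.
//   SmallVector<int, 4> LoMask, HiMask;
//   createUnpackShuffleMask(MVT::v4i32, LoMask, /*Lo=*/true,  /*Unary=*/false);
//   createUnpackShuffleMask(MVT::v4i32, HiMask, /*Lo=*/false, /*Unary=*/false);
//   // LoMask == {0, 4, 1, 5}, HiMask == {2, 6, 3, 7}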
/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can
/// always succeed.
template <typename T>
void scaleShuffleMask(int Scale, ArrayRef<T> Mask,
SmallVectorImpl<T> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
size_t NumElts = Mask.size();
ScaledMask.assign(NumElts * Scale, -1);
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
// Repeat sentinel values in every mask element.
if (M < 0) {
for (int s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = M;
continue;
}
// Scale mask element and increment across each mask element.
for (int s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = (Scale * M) + s;
}
}
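// Illustrative example (the values follow from the loop above): widening a
// 4-wide mask by Scale = 2 scales real indices and repeats sentinels.
//   SmallVector<int, 8> Scaled;
//   int Mask[] = {0, -1, 3, 2};
//   scaleShuffleMask<int>(/*Scale=*/2, Mask, Scaled);
//   // Scaled == {0, 1, -1, -1, 6, 7, 4, 5}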
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
Index: projects/clang900-import/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Target/X86/X86Subtarget.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Target/X86/X86Subtarget.cpp (revision 351722)
@@ -1,377 +1,380 @@
//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the X86 specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
#include "X86MacroFusion.h"
#include "X86RegisterBankInfo.h"
#include "X86Subtarget.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#if defined(_MSC_VER)
#include <intrin.h>
#endif
using namespace llvm;
#define DEBUG_TYPE "subtarget"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "X86GenSubtargetInfo.inc"
// Temporary option to control early if-conversion for x86 while adding machine
// models.
static cl::opt<bool>
X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
cl::desc("Enable early if-conversion on X86"));
/// Classify a blockaddress reference for the current subtarget according to how
/// we should reference it in a non-pcrel context.
unsigned char X86Subtarget::classifyBlockAddressReference() const {
return classifyLocalReference(nullptr);
}
/// Classify a global variable reference for the current subtarget according to
/// how we should reference it in a non-pcrel context.
unsigned char
X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
return classifyGlobalReference(GV, *GV->getParent());
}
unsigned char
X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
// If we're not PIC, it's not very interesting.
if (!isPositionIndependent())
return X86II::MO_NO_FLAG;
if (is64Bit()) {
// 64-bit ELF PIC local references may use GOTOFF relocations.
if (isTargetELF()) {
switch (TM.getCodeModel()) {
// 64-bit small code model is simple: All rip-relative.
case CodeModel::Tiny:
llvm_unreachable("Tiny codesize model not supported on X86");
case CodeModel::Small:
case CodeModel::Kernel:
return X86II::MO_NO_FLAG;
// The large PIC code model uses GOTOFF.
case CodeModel::Large:
return X86II::MO_GOTOFF;
// Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data.
case CodeModel::Medium:
if (isa<Function>(GV))
return X86II::MO_NO_FLAG; // All code is RIP-relative
return X86II::MO_GOTOFF; // Local symbols use GOTOFF.
}
llvm_unreachable("invalid code model");
}
// Otherwise, this is either a RIP-relative reference or a 64-bit movabsq,
// both of which use MO_NO_FLAG.
return X86II::MO_NO_FLAG;
}
// The COFF dynamic linker just patches the executable sections.
if (isTargetCOFF())
return X86II::MO_NO_FLAG;
if (isTargetDarwin()) {
// 32-bit Mach-O has no relocation for a-b if a is undefined, even if
// b is in the section that is being relocated.
// This means we have to use a load even for GVs that are known to be
// local to the dso.
if (GV && (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
return X86II::MO_PIC_BASE_OFFSET;
}
return X86II::MO_GOTOFF;
}
unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
const Module &M) const {
// The static large model never uses stubs.
if (TM.getCodeModel() == CodeModel::Large && !isPositionIndependent())
return X86II::MO_NO_FLAG;
// Absolute symbols can be referenced directly.
if (GV) {
if (Optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) {
// See if we can use the 8-bit immediate form. Note that some instructions
// will sign extend the immediate operand, so to be conservative we only
// accept the range [0,128).
if (CR->getUnsignedMax().ult(128))
return X86II::MO_ABS8;
else
return X86II::MO_NO_FLAG;
}
}
if (TM.shouldAssumeDSOLocal(M, GV))
return classifyLocalReference(GV);
if (isTargetCOFF()) {
if (GV->hasDLLImportStorageClass())
return X86II::MO_DLLIMPORT;
return X86II::MO_COFFSTUB;
}
+ // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables.
+ if (isOSWindows())
+ return X86II::MO_NO_FLAG;
if (is64Bit()) {
// ELF supports a large, truly PIC code model with non-PC relative GOT
// references. Other object file formats do not. Use the no-flag, 64-bit
// reference for them.
if (TM.getCodeModel() == CodeModel::Large)
return isTargetELF() ? X86II::MO_GOT : X86II::MO_NO_FLAG;
return X86II::MO_GOTPCREL;
}
if (isTargetDarwin()) {
if (!isPositionIndependent())
return X86II::MO_DARWIN_NONLAZY;
return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
}
return X86II::MO_GOT;
}
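// Illustrative example (follows from the logic above), assuming an x86-64
// ELF -fPIC subtarget with the small code model; the GV names are purely
// hypothetical:
//   classifyGlobalReference(SomeDSOLocalGV, M)  == X86II::MO_NO_FLAG
//   classifyGlobalReference(SomeExternalGV, M)  == X86II::MO_GOTPCREL
// The first goes through classifyLocalReference() and is addressed
// RIP-relatively; the second is loaded through the GOT.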
unsigned char
X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
return classifyGlobalFunctionReference(GV, *GV->getParent());
}
unsigned char
X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
const Module &M) const {
if (TM.shouldAssumeDSOLocal(M, GV))
return X86II::MO_NO_FLAG;
// Functions on COFF can be non-DSO local for two reasons:
// - They are marked dllimport
// - They are extern_weak, and a stub is needed
if (isTargetCOFF()) {
if (GV->hasDLLImportStorageClass())
return X86II::MO_DLLIMPORT;
return X86II::MO_COFFSTUB;
}
const Function *F = dyn_cast_or_null<Function>(GV);
if (isTargetELF()) {
if (is64Bit() && F && (CallingConv::X86_RegCall == F->getCallingConv()))
// According to psABI, PLT stub clobbers XMM8-XMM15.
// In Regcall calling convention those registers are used for passing
// parameters. Thus we need to prevent lazy binding in Regcall.
return X86II::MO_GOTPCREL;
// If PLT must be avoided then the call should be via GOTPCREL.
if (((F && F->hasFnAttribute(Attribute::NonLazyBind)) ||
(!F && M.getRtLibUseGOT())) &&
is64Bit())
return X86II::MO_GOTPCREL;
return X86II::MO_PLT;
}
if (is64Bit()) {
if (F && F->hasFnAttribute(Attribute::NonLazyBind))
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
// at the cost of eager binding (and one extra byte of encoding).
return X86II::MO_GOTPCREL;
return X86II::MO_NO_FLAG;
}
return X86II::MO_NO_FLAG;
}
/// Return true if the subtarget allows calls to immediate address.
bool X86Subtarget::isLegalToCallImmediateAddr() const {
// FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
// but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
// the following check for Win32 should be removed.
if (In64BitMode || isTargetWin32())
return false;
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
std::string CPUName = CPU;
if (CPUName.empty())
CPUName = "generic";
std::string FullFS = FS;
if (In64BitMode) {
// SSE2 should default to enabled in 64-bit mode, but can be turned off
// explicitly.
if (!FullFS.empty())
FullFS = "+sse2," + FullFS;
else
FullFS = "+sse2";
// If no CPU was specified, enable the 64bit feature to satisfy a later check.
if (CPUName == "generic") {
if (!FullFS.empty())
FullFS = "+64bit," + FullFS;
else
FullFS = "+64bit";
}
}
// LAHF/SAHF are always supported in non-64-bit mode.
if (!In64BitMode) {
if (!FullFS.empty())
FullFS = "+sahf," + FullFS;
else
FullFS = "+sahf";
}
// Parse features string and set the CPU.
ParseSubtargetFeatures(CPUName, FullFS);
// All CPUs that implement SSE4.2 or SSE4A support reasonably fast unaligned
// accesses of 16 bytes and under. These features were
// introduced with Intel's Nehalem/Silvermont and AMD's Family10h
// micro-architectures respectively.
if (hasSSE42() || hasSSE4A())
IsUAMem16Slow = false;
// It's important to keep the MCSubtargetInfo feature bits in sync with
// target data structure which is shared with MC code emitter, etc.
if (In64BitMode)
ToggleFeature(X86::Mode64Bit);
else if (In32BitMode)
ToggleFeature(X86::Mode32Bit);
else if (In16BitMode)
ToggleFeature(X86::Mode16Bit);
else
llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
<< ", 3DNowLevel " << X863DNowLevel << ", 64bit "
<< HasX86_64 << "\n");
if (In64BitMode && !HasX86_64)
report_fatal_error("64-bit code requested on a subtarget that doesn't "
"support it!");
// Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both
// 32 and 64 bit) and for all 64-bit targets.
if (StackAlignOverride)
stackAlignment = StackAlignOverride;
else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
isTargetKFreeBSD() || In64BitMode)
stackAlignment = 16;
// Some CPUs have more overhead for gather. The specified overhead is relative
// to the Load operation. "2" is the number provided by Intel architects. This
// parameter is used for cost estimation of Gather Op and comparison with
// other alternatives.
// TODO: Remove the explicit hasAVX512()? That would mean we would only
// enable gather with a -march.
if (hasAVX512() || (hasAVX2() && hasFastGather()))
GatherOverhead = 2;
if (hasAVX512())
ScatterOverhead = 2;
// Consume the vector width attribute or apply any target specific limit.
if (PreferVectorWidthOverride)
PreferVectorWidth = PreferVectorWidthOverride;
else if (Prefer256Bit)
PreferVectorWidth = 256;
}
X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
initSubtargetFeatures(CPU, FS);
return *this;
}
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
unsigned StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth)
: X86GenSubtargetInfo(TT, CPU, FS),
PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),
PreferVectorWidthOverride(PreferVectorWidthOverride),
RequiredVectorWidth(RequiredVectorWidth),
In64BitMode(TargetTriple.getArch() == Triple::x86_64),
In32BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() != Triple::CODE16),
In16BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() == Triple::CODE16),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (!isPositionIndependent())
setPICStyle(PICStyles::None);
else if (is64Bit())
setPICStyle(PICStyles::RIPRel);
else if (isTargetCOFF())
setPICStyle(PICStyles::None);
else if (isTargetDarwin())
setPICStyle(PICStyles::StubPIC);
else if (isTargetELF())
setPICStyle(PICStyles::GOT);
CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering()));
Legalizer.reset(new X86LegalizerInfo(*this, TM));
auto *RBI = new X86RegisterBankInfo(*getRegisterInfo());
RegBankInfo.reset(RBI);
InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI));
}
const CallLowering *X86Subtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
const InstructionSelector *X86Subtarget::getInstructionSelector() const {
return InstSelector.get();
}
const LegalizerInfo *X86Subtarget::getLegalizerInfo() const {
return Legalizer.get();
}
const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
return RegBankInfo.get();
}
bool X86Subtarget::enableEarlyIfConversion() const {
return hasCMov() && X86EarlyIfConv;
}
void X86Subtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(createX86MacroFusionDAGMutation());
}
Index: projects/clang900-import/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp (revision 351722)
@@ -1,976 +1,978 @@
//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements some loop unrolling utilities. It does not define any
// actual pass or policy, but provides a single function to perform loop
// unrolling.
//
// The process of unrolling can produce extraneous basic blocks linked with
// unconditional branches. This will be corrected in the future.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;
#define DEBUG_TYPE "loop-unroll"
// TODO: Should these be here or in LoopUnroll?
STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
STATISTIC(NumUnrolledWithHeader, "Number of loops unrolled without a "
"conditional latch (completely or otherwise)");
static cl::opt<bool>
UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
cl::desc("Allow runtime unrolled loops to be unrolled "
"with epilog instead of prolog."));
static cl::opt<bool>
UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
cl::desc("Verify domtree after unrolling"),
#ifdef EXPENSIVE_CHECKS
cl::init(true)
#else
cl::init(false)
#endif
);
/// Convert the instruction operands from referencing the current values into
/// those specified by VMap.
void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
Value *Op = I->getOperand(op);
// Unwrap arguments of dbg.value intrinsics.
bool Wrapped = false;
if (auto *V = dyn_cast<MetadataAsValue>(Op))
if (auto *Unwrapped = dyn_cast<ValueAsMetadata>(V->getMetadata())) {
Op = Unwrapped->getValue();
Wrapped = true;
}
auto wrap = [&](Value *V) {
auto &C = I->getContext();
return Wrapped ? MetadataAsValue::get(C, ValueAsMetadata::get(V)) : V;
};
ValueToValueMapTy::iterator It = VMap.find(Op);
if (It != VMap.end())
I->setOperand(op, wrap(It->second));
}
if (PHINode *PN = dyn_cast<PHINode>(I)) {
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i));
if (It != VMap.end())
PN->setIncomingBlock(i, cast<BasicBlock>(It->second));
}
}
}
/// Check if unrolling created a situation where we need to insert phi nodes to
/// preserve LCSSA form.
/// \param Blocks is a vector of basic blocks representing the unrolled loop.
/// \param L is the outer loop.
/// It's possible that some of the blocks are in L, and some are not. In this
/// case, if there is a use outside L and the definition is inside L, we need to
/// insert a phi-node, otherwise LCSSA will be broken.
/// The function is just a helper function for llvm::UnrollLoop that returns
/// true if this situation occurs, indicating that LCSSA needs to be fixed.
static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
LoopInfo *LI) {
for (BasicBlock *BB : Blocks) {
if (LI->getLoopFor(BB) == L)
continue;
for (Instruction &I : *BB) {
for (Use &U : I.operands()) {
if (auto Def = dyn_cast<Instruction>(U)) {
Loop *DefLoop = LI->getLoopFor(Def->getParent());
if (!DefLoop)
continue;
if (DefLoop->contains(L))
return true;
}
}
}
}
return false;
}
/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
/// and adds a mapping from the original loop to the new loop to NewLoops.
/// Returns nullptr if no new loop was created; otherwise returns a pointer to
/// the original loop that OriginalBB was part of.
const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
BasicBlock *ClonedBB, LoopInfo *LI,
NewLoopsMap &NewLoops) {
// Figure out which loop New is in.
const Loop *OldLoop = LI->getLoopFor(OriginalBB);
assert(OldLoop && "Should (at least) be in the loop being unrolled!");
Loop *&NewLoop = NewLoops[OldLoop];
if (!NewLoop) {
// Found a new sub-loop.
assert(OriginalBB == OldLoop->getHeader() &&
"Header should be first in RPO");
NewLoop = LI->AllocateLoop();
Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
if (NewLoopParent)
NewLoopParent->addChildLoop(NewLoop);
else
LI->addTopLevelLoop(NewLoop);
NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
return OldLoop;
} else {
NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
return nullptr;
}
}
/// The function chooses which type of unroll (epilog or prolog) is more
/// profitable.
/// Epilog unroll is more profitable when there is a PHI that starts from a
/// constant. In this case the epilog will leave the PHI starting from a
/// constant, but the prolog will convert it to a non-constant.
///
/// loop:
/// PN = PHI [I, Latch], [CI, PreHeader]
/// I = foo(PN)
/// ...
///
/// Epilog unroll case.
/// loop:
/// PN = PHI [I2, Latch], [CI, PreHeader]
/// I1 = foo(PN)
/// I2 = foo(I1)
/// ...
/// Prolog unroll case.
/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
/// loop:
/// PN = PHI [I2, Latch], [NewPN, PreHeader]
/// I1 = foo(PN)
/// I2 = foo(I1)
/// ...
///
static bool isEpilogProfitable(Loop *L) {
BasicBlock *PreHeader = L->getLoopPreheader();
BasicBlock *Header = L->getHeader();
assert(PreHeader && Header);
for (const PHINode &PN : Header->phis()) {
if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
return true;
}
return false;
}
/// Perform some cleanup and simplifications on loops after unrolling. It is
/// useful to simplify the IV's in the new loop, as well as do a quick
/// simplify/dce pass of the instructions.
void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC) {
// Simplify any new induction variables in the partially unrolled loop.
if (SE && SimplifyIVs) {
SmallVector<WeakTrackingVH, 16> DeadInsts;
simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
// Aggressively clean up dead instructions that simplifyLoopIVs already
// identified. Any remaining should be cleaned up below.
while (!DeadInsts.empty())
if (Instruction *Inst =
dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
RecursivelyDeleteTriviallyDeadInstructions(Inst);
}
// At this point, the code is well formed. We now do a quick sweep over the
// inserted code, doing constant propagation and dead code elimination as we
// go.
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
for (BasicBlock *BB : L->getBlocks()) {
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
Instruction *Inst = &*I++;
if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
if (LI->replacementPreservesLCSSAForm(Inst, V))
Inst->replaceAllUsesWith(V);
if (isInstructionTriviallyDead(Inst))
BB->getInstList().erase(Inst);
}
}
// TODO: after peeling or unrolling, previously loop variant conditions are
// likely to fold to constants, eagerly propagating those here will require
// fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
// appropriate.
}
/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
/// loop unrolling will mostly produce more code that is no faster.
///
/// TripCount is the upper bound of the iteration on which control exits
/// LatchBlock. Control may exit the loop prior to TripCount iterations either
/// via an early branch in another loop block or via the LatchBlock terminator.
/// This is relaxed from the general definition of trip count, which is the
/// number of times the loop header executes. Note that UnrollLoop assumes that
/// the loop counter test is in LatchBlock in order to remove unnecessary instances of
/// the test. If control can exit the loop from the LatchBlock's terminator
/// prior to TripCount iterations, flag PreserveCondBr needs to be set.
///
/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
/// needs to be preserved. It is needed when we use trip count upper bound to
/// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
/// conditional branch needs to be preserved.
///
/// Similarly, TripMultiple divides the number of times that the LatchBlock may
/// execute without exiting the loop.
///
/// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
/// have a runtime (i.e. not compile-time constant) trip count. Unrolling these
/// loops requires an unroll "prologue" that runs "RuntimeTripCount % Count"
/// iterations before branching into the unrolled loop. UnrollLoop will not
/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
/// AllowExpensiveTripCount is false.
///
/// If we want to perform PGO-based loop peeling, PeelCount is set to the
/// number of iterations we want to peel off.
///
/// The LoopInfo Analysis that is passed will be kept consistent.
///
/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
/// DominatorTree if they are non-null.
///
/// If RemainderLoop is non-null, it will receive the remainder loop (if
/// required and not fully unrolled).
LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE,
bool PreserveLCSSA, Loop **RemainderLoop) {
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
return LoopUnrollResult::Unmodified;
}
BasicBlock *LatchBlock = L->getLoopLatch();
if (!LatchBlock) {
LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
return LoopUnrollResult::Unmodified;
}
// Loops with indirectbr cannot be cloned.
if (!L->isSafeToClone()) {
LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
return LoopUnrollResult::Unmodified;
}
// The current loop unroll pass can unroll loops with a single latch or header
// whose terminator is a conditional branch exiting the loop.
// FIXME: The implementation can be extended to work with more complicated
// cases, e.g. loops with multiple latches.
BasicBlock *Header = L->getHeader();
BranchInst *HeaderBI = dyn_cast<BranchInst>(Header->getTerminator());
BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
// FIXME: Support loops without conditional latch and multiple exiting blocks.
if (!BI ||
(BI->isUnconditional() && (!HeaderBI || HeaderBI->isUnconditional() ||
L->getExitingBlock() != Header))) {
LLVM_DEBUG(dbgs() << " Can't unroll; loop not terminated by a conditional "
"branch in the latch or header.\n");
return LoopUnrollResult::Unmodified;
}
auto CheckLatchSuccessors = [&](unsigned S1, unsigned S2) {
return BI->isConditional() && BI->getSuccessor(S1) == Header &&
!L->contains(BI->getSuccessor(S2));
};
// If we have a conditional latch, it must exit the loop.
if (BI && BI->isConditional() && !CheckLatchSuccessors(0, 1) &&
!CheckLatchSuccessors(1, 0)) {
LLVM_DEBUG(
dbgs() << "Can't unroll; a conditional latch must exit the loop");
return LoopUnrollResult::Unmodified;
}
auto CheckHeaderSuccessors = [&](unsigned S1, unsigned S2) {
return HeaderBI && HeaderBI->isConditional() &&
L->contains(HeaderBI->getSuccessor(S1)) &&
!L->contains(HeaderBI->getSuccessor(S2));
};
// If we do not have a conditional latch, the header must exit the loop.
if (BI && !BI->isConditional() && HeaderBI && HeaderBI->isConditional() &&
!CheckHeaderSuccessors(0, 1) && !CheckHeaderSuccessors(1, 0)) {
LLVM_DEBUG(dbgs() << "Can't unroll; conditional header must exit the loop");
return LoopUnrollResult::Unmodified;
}
if (Header->hasAddressTaken()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
LLVM_DEBUG(
dbgs() << " Won't unroll loop: address of header block is taken.\n");
return LoopUnrollResult::Unmodified;
}
if (ULO.TripCount != 0)
LLVM_DEBUG(dbgs() << " Trip Count = " << ULO.TripCount << "\n");
if (ULO.TripMultiple != 1)
LLVM_DEBUG(dbgs() << " Trip Multiple = " << ULO.TripMultiple << "\n");
// Effectively "DCE" unrolled iterations that are beyond the tripcount
// and will never be executed.
if (ULO.TripCount != 0 && ULO.Count > ULO.TripCount)
ULO.Count = ULO.TripCount;
// Don't enter the unroll code if there is nothing to do.
if (ULO.TripCount == 0 && ULO.Count < 2 && ULO.PeelCount == 0) {
LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
return LoopUnrollResult::Unmodified;
}
assert(ULO.Count > 0);
assert(ULO.TripMultiple > 0);
assert(ULO.TripCount == 0 || ULO.TripCount % ULO.TripMultiple == 0);
// Are we eliminating the loop control altogether?
bool CompletelyUnroll = ULO.Count == ULO.TripCount;
SmallVector<BasicBlock *, 4> ExitBlocks;
L->getExitBlocks(ExitBlocks);
std::vector<BasicBlock*> OriginalLoopBlocks = L->getBlocks();
// Go through all exits of L and see if there are any phi-nodes there. We just
// conservatively assume that they're inserted to preserve LCSSA form, which
// means that complete unrolling might break this form. We need to either fix
// it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
// now we just recompute LCSSA for the outer loop, but it should be possible
// to fix it in-place.
bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll &&
any_of(ExitBlocks, [](const BasicBlock *BB) {
return isa<PHINode>(BB->begin());
});
// We assume a run-time trip count if the compiler cannot
// figure out the loop trip count and the unroll-runtime
// flag is specified.
bool RuntimeTripCount =
(ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime);
assert((!RuntimeTripCount || !ULO.PeelCount) &&
"Did not expect runtime trip-count unrolling "
"and peeling for the same loop");
bool Peeled = false;
if (ULO.PeelCount) {
Peeled = peelLoop(L, ULO.PeelCount, LI, SE, DT, AC, PreserveLCSSA);
// Successful peeling may result in a change in the loop preheader/trip
// counts. If we later unroll the loop, we want these to be updated.
if (Peeled) {
// According to our guards and profitability checks, the only
// meaningful exit should be the latch block. Other exits go to deopt,
// so we do not worry about them.
BasicBlock *ExitingBlock = L->getLoopLatch();
assert(ExitingBlock && "Loop without exiting block?");
assert(L->isLoopExiting(ExitingBlock) && "Latch is not exiting?");
Preheader = L->getLoopPreheader();
ULO.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
ULO.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
}
}
// Loops containing convergent instructions must have a count that divides
// their TripMultiple.
LLVM_DEBUG(
{
bool HasConvergent = false;
for (auto &BB : L->blocks())
for (auto &I : *BB)
if (auto CS = CallSite(&I))
HasConvergent |= CS.isConvergent();
assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) &&
"Unroll count must divide trip multiple if loop contains a "
"convergent operation.");
});
bool EpilogProfitability =
UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
: isEpilogProfitable(L);
if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 &&
!UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
EpilogProfitability, ULO.UnrollRemainder,
ULO.ForgetAllSCEV, LI, SE, DT, AC,
PreserveLCSSA, RemainderLoop)) {
if (ULO.Force)
RuntimeTripCount = false;
else {
LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
"generated when assuming runtime trip count\n");
return LoopUnrollResult::Unmodified;
}
}
// If we know the trip count, we know the multiple...
unsigned BreakoutTrip = 0;
if (ULO.TripCount != 0) {
BreakoutTrip = ULO.TripCount % ULO.Count;
ULO.TripMultiple = 0;
} else {
// Figure out what multiple to use.
BreakoutTrip = ULO.TripMultiple =
(unsigned)GreatestCommonDivisor64(ULO.Count, ULO.TripMultiple);
}
using namespace ore;
// Report the unrolling decision.
if (CompletelyUnroll) {
LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
<< " with trip count " << ULO.TripCount << "!\n");
if (ORE)
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
L->getHeader())
<< "completely unrolled loop with "
<< NV("UnrollCount", ULO.TripCount) << " iterations";
});
} else if (ULO.PeelCount) {
LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
<< " with iteration count " << ULO.PeelCount << "!\n");
if (ORE)
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
L->getHeader())
<< " peeled loop by " << NV("PeelCount", ULO.PeelCount)
<< " iterations";
});
} else {
auto DiagBuilder = [&]() {
OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
L->getHeader());
return Diag << "unrolled loop by a factor of "
<< NV("UnrollCount", ULO.Count);
};
LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
<< ULO.Count);
if (ULO.TripMultiple == 0 || BreakoutTrip != ULO.TripMultiple) {
LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
if (ORE)
ORE->emit([&]() {
return DiagBuilder() << " with a breakout at trip "
<< NV("BreakoutTrip", BreakoutTrip);
});
} else if (ULO.TripMultiple != 1) {
LLVM_DEBUG(dbgs() << " with " << ULO.TripMultiple << " trips per branch");
if (ORE)
ORE->emit([&]() {
return DiagBuilder()
<< " with " << NV("TripMultiple", ULO.TripMultiple)
<< " trips per branch";
});
} else if (RuntimeTripCount) {
LLVM_DEBUG(dbgs() << " with run-time trip count");
if (ORE)
ORE->emit(
[&]() { return DiagBuilder() << " with run-time trip count"; });
}
LLVM_DEBUG(dbgs() << "!\n");
}
// We are going to make changes to this loop. SCEV may be keeping cached info
// about it, in particular about backedge taken count. The changes we make
// are guaranteed to invalidate this information for our loop. It is tempting
// to only invalidate the loop being unrolled, but that is incorrect because
// exiting branches from inner loops have an impact on the outer loops, so if
// something changes inside them then any of the outer loops may also change.
// When we forget the outermost loop, we also forget all contained loops,
// which is what we need here.
if (SE) {
if (ULO.ForgetAllSCEV)
SE->forgetAllLoops();
else
SE->forgetTopmostLoop(L);
}
bool ContinueOnTrue;
bool LatchIsExiting = BI->isConditional();
BasicBlock *LoopExit = nullptr;
if (LatchIsExiting) {
ContinueOnTrue = L->contains(BI->getSuccessor(0));
LoopExit = BI->getSuccessor(ContinueOnTrue);
} else {
NumUnrolledWithHeader++;
ContinueOnTrue = L->contains(HeaderBI->getSuccessor(0));
LoopExit = HeaderBI->getSuccessor(ContinueOnTrue);
}
// For the first iteration of the loop, we should use the precloned values for
// PHI nodes. Insert associations now.
ValueToValueMapTy LastValueMap;
std::vector<PHINode*> OrigPHINode;
for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
OrigPHINode.push_back(cast<PHINode>(I));
}
std::vector<BasicBlock *> Headers;
std::vector<BasicBlock *> HeaderSucc;
std::vector<BasicBlock *> Latches;
Headers.push_back(Header);
Latches.push_back(LatchBlock);
if (!LatchIsExiting) {
auto *Term = cast<BranchInst>(Header->getTerminator());
if (Term->isUnconditional() || L->contains(Term->getSuccessor(0))) {
assert(L->contains(Term->getSuccessor(0)));
HeaderSucc.push_back(Term->getSuccessor(0));
} else {
assert(L->contains(Term->getSuccessor(1)));
HeaderSucc.push_back(Term->getSuccessor(1));
}
}
// The current on-the-fly SSA update requires blocks to be processed in
// reverse postorder so that LastValueMap contains the correct value at each
// exit.
LoopBlocksDFS DFS(L);
DFS.perform(LI);
// Stash the DFS iterators before adding blocks to the loop.
LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
// Loop Unrolling might create new loops. While we do preserve LoopInfo, we
// might break loop-simplified form for these loops (as they, e.g., would
// share the same exit blocks). We'll keep track of loops for which we can
// break this so that later we can re-simplify them.
SmallSetVector<Loop *, 4> LoopsToSimplify;
for (Loop *SubLoop : *L)
LoopsToSimplify.insert(SubLoop);
if (Header->getParent()->isDebugInfoForProfiling())
for (BasicBlock *BB : L->getBlocks())
for (Instruction &I : *BB)
if (!isa<DbgInfoIntrinsic>(&I))
if (const DILocation *DIL = I.getDebugLoc()) {
auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count);
if (NewDIL)
I.setDebugLoc(NewDIL.getValue());
else
LLVM_DEBUG(dbgs()
<< "Failed to create new discriminator: "
<< DIL->getFilename() << " Line: " << DIL->getLine());
}
for (unsigned It = 1; It != ULO.Count; ++It) {
std::vector<BasicBlock*> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
NewLoops[L] = L;
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
ValueToValueMapTy VMap;
BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
Header->getParent()->getBasicBlockList().push_back(New);
assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
"Header should not be in a sub-loop");
// Tell LI about New.
const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
if (OldLoop)
LoopsToSimplify.insert(NewLoops[OldLoop]);
if (*BB == Header)
// Loop over all of the PHI nodes in the block, changing them to use
// the incoming values from the previous block.
for (PHINode *OrigPHI : OrigPHINode) {
PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
if (Instruction *InValI = dyn_cast<Instruction>(InVal))
if (It > 1 && L->contains(InValI))
InVal = LastValueMap[InValI];
VMap[OrigPHI] = InVal;
New->getInstList().erase(NewPHI);
}
// Update our running map of newest clones
LastValueMap[*BB] = New;
for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
VI != VE; ++VI)
LastValueMap[VI->first] = VI->second;
// Add phi entries for newly created values to all exit blocks.
for (BasicBlock *Succ : successors(*BB)) {
if (L->contains(Succ))
continue;
for (PHINode &PHI : Succ->phis()) {
Value *Incoming = PHI.getIncomingValueForBlock(*BB);
ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
if (It != LastValueMap.end())
Incoming = It->second;
PHI.addIncoming(Incoming, New);
}
}
// Keep track of new headers and latches as we create them, so that
// we can insert the proper branches later.
if (*BB == Header)
Headers.push_back(New);
if (*BB == LatchBlock)
Latches.push_back(New);
// Keep track of the successor of the new header in the current iteration.
for (auto *Pred : predecessors(*BB))
if (Pred == Header) {
HeaderSucc.push_back(New);
break;
}
NewBlocks.push_back(New);
UnrolledLoopBlocks.push_back(New);
// Update DomTree: since we just copy the loop body, and each copy has a
// dedicated entry block (copy of the header block), this header's copy
// dominates all copied blocks. That means, dominance relations in the
// copied body are the same as in the original body.
if (DT) {
if (*BB == Header)
DT->addNewBlock(New, Latches[It - 1]);
else {
auto BBDomNode = DT->getNode(*BB);
auto BBIDom = BBDomNode->getIDom();
BasicBlock *OriginalBBIDom = BBIDom->getBlock();
DT->addNewBlock(
New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
}
}
}
// Remap all instructions in the most recent iteration
for (BasicBlock *NewBlock : NewBlocks) {
for (Instruction &I : *NewBlock) {
::remapInstruction(&I, LastValueMap);
if (auto *II = dyn_cast<IntrinsicInst>(&I))
if (II->getIntrinsicID() == Intrinsic::assume)
AC->registerAssumption(II);
}
}
}
// Loop over the PHI nodes in the original block, setting incoming values.
for (PHINode *PN : OrigPHINode) {
if (CompletelyUnroll) {
PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
Header->getInstList().erase(PN);
} else if (ULO.Count > 1) {
Value *InVal = PN->removeIncomingValue(LatchBlock, false);
// If this value was defined in the loop, take the value defined by the
// last iteration of the loop.
if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
if (L->contains(InValI))
InVal = LastValueMap[InVal];
}
assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
PN->addIncoming(InVal, Latches.back());
}
}
auto setDest = [LoopExit, ContinueOnTrue](BasicBlock *Src, BasicBlock *Dest,
ArrayRef<BasicBlock *> NextBlocks,
- BasicBlock *CurrentHeader,
+ BasicBlock *BlockInLoop,
bool NeedConditional) {
auto *Term = cast<BranchInst>(Src->getTerminator());
if (NeedConditional) {
// Update the conditional branch's successor for the following
// iteration.
Term->setSuccessor(!ContinueOnTrue, Dest);
} else {
// Remove phi operands at this loop exit
if (Dest != LoopExit) {
BasicBlock *BB = Src;
for (BasicBlock *Succ : successors(BB)) {
- if (Succ == CurrentHeader)
+ // Preserve the incoming value from BB if we are jumping to the block
+ // in the current loop.
+ if (Succ == BlockInLoop)
continue;
for (PHINode &Phi : Succ->phis())
Phi.removeIncomingValue(BB, false);
}
}
// Replace the conditional branch with an unconditional one.
BranchInst::Create(Dest, Term);
Term->eraseFromParent();
}
};
// Now that all the basic blocks for the unrolled iterations are in place,
// set up the branches to connect them.
if (LatchIsExiting) {
// Set up latches to branch to the new header in the unrolled iterations or
// the loop exit for the last latch in a fully unrolled loop.
for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
// The branch destination.
unsigned j = (i + 1) % e;
BasicBlock *Dest = Headers[j];
bool NeedConditional = true;
if (RuntimeTripCount && j != 0) {
NeedConditional = false;
}
// For a complete unroll, make the last iteration end with a branch
// to the exit block.
if (CompletelyUnroll) {
if (j == 0)
Dest = LoopExit;
// If using trip count upper bound to completely unroll, we need to keep
// the conditional branch except the last one because the loop may exit
// after any iteration.
assert(NeedConditional &&
"NeedCondition cannot be modified by both complete "
"unrolling and runtime unrolling");
NeedConditional =
(ULO.PreserveCondBr && j && !(ULO.PreserveOnlyFirst && i != 0));
} else if (j != BreakoutTrip &&
(ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0)) {
// If we know the trip count or a multiple of it, we can safely use an
// unconditional branch for some iterations.
NeedConditional = false;
}
setDest(Latches[i], Dest, Headers, Headers[i], NeedConditional);
}
} else {
// Setup headers to branch to their new successors in the unrolled
// iterations.
for (unsigned i = 0, e = Headers.size(); i != e; ++i) {
// The branch destination.
unsigned j = (i + 1) % e;
BasicBlock *Dest = HeaderSucc[i];
bool NeedConditional = true;
if (RuntimeTripCount && j != 0)
NeedConditional = false;
if (CompletelyUnroll)
// We cannot drop the conditional branch for the last condition, as we
// may have to execute the loop body depending on the condition.
NeedConditional = j == 0 || ULO.PreserveCondBr;
else if (j != BreakoutTrip &&
(ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0))
// If we know the trip count or a multiple of it, we can safely use an
// unconditional branch for some iterations.
NeedConditional = false;
- setDest(Headers[i], Dest, Headers, Headers[i], NeedConditional);
+ setDest(Headers[i], Dest, Headers, HeaderSucc[i], NeedConditional);
}
// Set up latches to branch to the new header in the unrolled iterations or
// the loop exit for the last latch in a fully unrolled loop.
for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
// The original branch was replicated in each unrolled iteration.
BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
// The branch destination.
unsigned j = (i + 1) % e;
BasicBlock *Dest = Headers[j];
// When completely unrolling, the last latch becomes unreachable.
if (CompletelyUnroll && j == 0)
new UnreachableInst(Term->getContext(), Term);
else
// Replace the conditional branch with an unconditional one.
BranchInst::Create(Dest, Term);
Term->eraseFromParent();
}
}
// Update dominators of blocks we might reach through exits.
// Immediate dominator of such block might change, because we add more
// routes which can lead to the exit: we can now reach it from the copied
// iterations too.
if (DT && ULO.Count > 1) {
for (auto *BB : OriginalLoopBlocks) {
auto *BBDomNode = DT->getNode(BB);
SmallVector<BasicBlock *, 16> ChildrenToUpdate;
for (auto *ChildDomNode : BBDomNode->getChildren()) {
auto *ChildBB = ChildDomNode->getBlock();
if (!L->contains(ChildBB))
ChildrenToUpdate.push_back(ChildBB);
}
BasicBlock *NewIDom;
BasicBlock *&TermBlock = LatchIsExiting ? LatchBlock : Header;
auto &TermBlocks = LatchIsExiting ? Latches : Headers;
if (BB == TermBlock) {
// The latch is special because we emit unconditional branches in
// some cases where the original loop contained a conditional branch.
// Since the latch is always at the bottom of the loop, if the latch
// dominated an exit before unrolling, the new dominator of that exit
// must also be a latch. Specifically, the dominator is the first
// latch which ends in a conditional branch, or the last latch if
// there is no such latch.
// For loops exiting from the header, we limit the supported loops
// to have a single exiting block.
NewIDom = TermBlocks.back();
for (BasicBlock *Iter : TermBlocks) {
Instruction *Term = Iter->getTerminator();
if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
NewIDom = Iter;
break;
}
}
} else {
// The new idom of the block will be the nearest common dominator
// of all copies of the previous idom. This is equivalent to the
// nearest common dominator of the previous idom and the first latch,
// which dominates all copies of the previous idom.
NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
}
for (auto *ChildBB : ChildrenToUpdate)
DT->changeImmediateDominator(ChildBB, NewIDom);
}
}
assert(!DT || !UnrollVerifyDomtree ||
DT->verify(DominatorTree::VerificationLevel::Fast));
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
// Merge adjacent basic blocks, if possible.
for (BasicBlock *Latch : Latches) {
BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator());
assert((Term ||
(CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&
"Need a branch as terminator, except when fully unrolling with "
"unconditional latch");
if (Term && Term->isUnconditional()) {
BasicBlock *Dest = Term->getSuccessor(0);
BasicBlock *Fold = Dest->getUniquePredecessor();
if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
// Dest has been folded into Fold. Update our worklists accordingly.
std::replace(Latches.begin(), Latches.end(), Dest, Fold);
UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(),
UnrolledLoopBlocks.end(), Dest),
UnrolledLoopBlocks.end());
}
}
}
// At this point, the code is well formed. We now simplify the unrolled loop,
// doing constant propagation and dead code elimination as we go.
simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI,
SE, DT, AC);
NumCompletelyUnrolled += CompletelyUnroll;
++NumUnrolled;
Loop *OuterL = L->getParentLoop();
// Update LoopInfo if the loop is completely removed.
if (CompletelyUnroll)
LI->erase(L);
// After complete unrolling most of the blocks should be contained in OuterL.
// However, some of them might happen to be out of OuterL (e.g. if they
// precede a loop exit). In this case we might need to insert PHI nodes in
// order to preserve LCSSA form.
// We don't need to check this if we already know that we need to fix LCSSA
// form.
// TODO: For now we just recompute LCSSA for the outer loop in this case, but
// it should be possible to fix it in-place.
if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
// If we have a pass and a DominatorTree we should re-simplify impacted loops
// to ensure subsequent analyses can rely on this form. We want to simplify
// at least one layer outside of the loop that was unrolled so that any
// changes to the parent loop exposed by the unrolling are considered.
if (DT) {
if (OuterL) {
// OuterL includes all loops for which we can break loop-simplify, so
// it's sufficient to simplify only it (it'll recursively simplify inner
// loops too).
if (NeedToFixLCSSA) {
// LCSSA must be performed on the outermost affected loop. The unrolled
// loop's last loop latch is guaranteed to be in the outermost loop
// after LoopInfo's been updated by LoopInfo::erase.
Loop *LatchLoop = LI->getLoopFor(Latches.back());
Loop *FixLCSSALoop = OuterL;
if (!FixLCSSALoop->contains(LatchLoop))
while (FixLCSSALoop->getParentLoop() != LatchLoop)
FixLCSSALoop = FixLCSSALoop->getParentLoop();
formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
} else if (PreserveLCSSA) {
assert(OuterL->isLCSSAForm(*DT) &&
"Loops should be in LCSSA form after loop-unroll.");
}
// TODO: That potentially might be compile-time expensive. We should try
// to fix the loop-simplified form incrementally.
simplifyLoop(OuterL, DT, LI, SE, AC, nullptr, PreserveLCSSA);
} else {
// Simplify loops for which we might've broken loop-simplify form.
for (Loop *SubLoop : LoopsToSimplify)
simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA);
}
}
return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
: LoopUnrollResult::PartiallyUnrolled;
}
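A caller-side sketch of the contract documented above (editor's illustration, not part of this revision): the UnrollLoopOptions fields shown are the ones the function body reads; the concrete counts and the analyses (LI, SE, DT, AC, ORE) are assumed to come from the enclosing pass.
// Partially unroll a loop with a known trip count of 128 by a factor of 4.
UnrollLoopOptions ULO;
ULO.Count = 4;
ULO.TripCount = 128;                 // known constant trip count
ULO.TripMultiple = 128;              // TripCount % TripMultiple == 0 must hold
ULO.PeelCount = 0;                   // no PGO-based peeling
ULO.AllowRuntime = false;            // trip count is a compile-time constant
ULO.AllowExpensiveTripCount = false;
ULO.PreserveCondBr = false;
ULO.PreserveOnlyFirst = false;
ULO.UnrollRemainder = false;
ULO.ForgetAllSCEV = false;
ULO.Force = false;
Loop *Remainder = nullptr;
LoopUnrollResult Res = UnrollLoop(L, ULO, LI, SE, DT, AC, ORE,
                                  /*PreserveLCSSA=*/true, &Remainder);
if (Res == LoopUnrollResult::Unmodified) {
  // The loop did not meet the structural requirements described above.
}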
/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
/// node with the given name (for example, "llvm.loop.unroll.count"). If no
/// such metadata node exists, then nullptr is returned.
MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
// First operand should refer to the loop id itself.
assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
if (!MD)
continue;
MDString *S = dyn_cast<MDString>(MD->getOperand(0));
if (!S)
continue;
if (Name.equals(S->getString()))
return MD;
}
return nullptr;
}
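A hedged usage sketch for the lookup above (editor's illustration): Loop::getLoopID() and mdconst::extract_or_null are existing LLVM APIs, and the operand layout (hint name in operand 0, value in operand 1) is the loop-metadata convention assumed here.
// Fetch the value attached to an "llvm.loop.unroll.count" hint, if any.
static unsigned getUnrollCountHint(const Loop *L) {
  if (MDNode *LoopID = L->getLoopID())
    if (MDNode *MD = GetUnrollMetadata(LoopID, "llvm.loop.unroll.count"))
      if (MD->getNumOperands() == 2)
        if (auto *CI = mdconst::extract_or_null<ConstantInt>(MD->getOperand(1)))
          return (unsigned)CI->getZExtValue();
  return 0; // no unroll-count hint attached to this loop
}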
Index: projects/clang900-import/contrib/llvm/tools/clang/include/clang/Frontend/LangStandards.def
===================================================================
--- projects/clang900-import/contrib/llvm/tools/clang/include/clang/Frontend/LangStandards.def (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/clang/include/clang/Frontend/LangStandards.def (revision 351722)
@@ -1,189 +1,189 @@
//===-- LangStandards.def - Language Standard Data --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LANGSTANDARD
#error "LANGSTANDARD must be defined before including this file"
#endif
/// LANGSTANDARD(IDENT, NAME, LANG, DESC, FEATURES)
///
/// \param IDENT - The name of the standard as a C++ identifier.
/// \param NAME - The name of the standard.
/// \param LANG - The InputKind::Language for which this is a standard.
/// \param DESC - A short description of the standard.
/// \param FEATURES - The standard features as flags; these are enums from the
/// clang::frontend namespace, which is assumed to be available.
/// LANGSTANDARD_ALIAS(IDENT, ALIAS)
/// \param IDENT - The name of the standard as a C++ identifier.
/// \param ALIAS - The alias of the standard.
/// LANGSTANDARD_ALIAS_DEPR(IDENT, ALIAS)
/// Same as LANGSTANDARD_ALIAS, but for a deprecated alias.
#ifndef LANGSTANDARD_ALIAS
#define LANGSTANDARD_ALIAS(IDENT, ALIAS)
#endif
#ifndef LANGSTANDARD_ALIAS_DEPR
#define LANGSTANDARD_ALIAS_DEPR(IDENT, ALIAS) LANGSTANDARD_ALIAS(IDENT, ALIAS)
#endif
// C89-ish modes.
LANGSTANDARD(c89, "c89",
C, "ISO C 1990",
ImplicitInt)
LANGSTANDARD_ALIAS(c89, "c90")
LANGSTANDARD_ALIAS(c89, "iso9899:1990")
LANGSTANDARD(c94, "iso9899:199409",
C, "ISO C 1990 with amendment 1",
Digraphs | ImplicitInt)
LANGSTANDARD(gnu89, "gnu89",
C, "ISO C 1990 with GNU extensions",
LineComment | Digraphs | GNUMode | ImplicitInt)
LANGSTANDARD_ALIAS(gnu89, "gnu90")
// C99-ish modes
LANGSTANDARD(c99, "c99",
C, "ISO C 1999",
LineComment | C99 | Digraphs | HexFloat)
LANGSTANDARD_ALIAS(c99, "iso9899:1999")
LANGSTANDARD_ALIAS_DEPR(c99, "c9x")
LANGSTANDARD_ALIAS_DEPR(c99, "iso9899:199x")
LANGSTANDARD(gnu99, "gnu99",
C, "ISO C 1999 with GNU extensions",
LineComment | C99 | Digraphs | GNUMode | HexFloat)
LANGSTANDARD_ALIAS_DEPR(gnu99, "gnu9x")
// C11 modes
LANGSTANDARD(c11, "c11",
C, "ISO C 2011",
LineComment | C99 | C11 | Digraphs | HexFloat)
LANGSTANDARD_ALIAS(c11, "iso9899:2011")
LANGSTANDARD_ALIAS_DEPR(c11, "c1x")
LANGSTANDARD_ALIAS_DEPR(c11, "iso9899:201x")
LANGSTANDARD(gnu11, "gnu11",
C, "ISO C 2011 with GNU extensions",
LineComment | C99 | C11 | Digraphs | GNUMode | HexFloat)
LANGSTANDARD_ALIAS_DEPR(gnu11, "gnu1x")
// C17 modes
LANGSTANDARD(c17, "c17",
C, "ISO C 2017",
LineComment | C99 | C11 | C17 | Digraphs | HexFloat)
LANGSTANDARD_ALIAS(c17, "iso9899:2017")
LANGSTANDARD_ALIAS(c17, "c18")
LANGSTANDARD_ALIAS(c17, "iso9899:2018")
LANGSTANDARD(gnu17, "gnu17",
C, "ISO C 2017 with GNU extensions",
LineComment | C99 | C11 | C17 | Digraphs | GNUMode | HexFloat)
LANGSTANDARD_ALIAS(gnu17, "gnu18")
// C2x modes
LANGSTANDARD(c2x, "c2x",
C, "Working Draft for ISO C2x",
LineComment | C99 | C11 | C17 | C2x | Digraphs | HexFloat)
LANGSTANDARD(gnu2x, "gnu2x",
C, "Working Draft for ISO C2x with GNU extensions",
LineComment | C99 | C11 | C17 | C2x | Digraphs | GNUMode | HexFloat)
// C++ modes
LANGSTANDARD(cxx98, "c++98",
CXX, "ISO C++ 1998 with amendments",
LineComment | CPlusPlus | Digraphs)
LANGSTANDARD_ALIAS(cxx98, "c++03")
LANGSTANDARD(gnucxx98, "gnu++98",
CXX, "ISO C++ 1998 with amendments and GNU extensions",
LineComment | CPlusPlus | Digraphs | GNUMode)
LANGSTANDARD_ALIAS(gnucxx98, "gnu++03")
LANGSTANDARD(cxx11, "c++11",
CXX, "ISO C++ 2011 with amendments",
LineComment | CPlusPlus | CPlusPlus11 | Digraphs)
LANGSTANDARD_ALIAS_DEPR(cxx11, "c++0x")
LANGSTANDARD(gnucxx11, "gnu++11", CXX,
"ISO C++ 2011 with amendments and GNU extensions",
LineComment | CPlusPlus | CPlusPlus11 | Digraphs | GNUMode)
LANGSTANDARD_ALIAS_DEPR(gnucxx11, "gnu++0x")
LANGSTANDARD(cxx14, "c++14",
CXX, "ISO C++ 2014 with amendments",
LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | Digraphs)
LANGSTANDARD_ALIAS_DEPR(cxx14, "c++1y")
LANGSTANDARD(gnucxx14, "gnu++14",
CXX, "ISO C++ 2014 with amendments and GNU extensions",
LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | Digraphs |
GNUMode)
LANGSTANDARD_ALIAS_DEPR(gnucxx14, "gnu++1y")
LANGSTANDARD(cxx17, "c++17",
CXX, "ISO C++ 2017 with amendments",
LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 |
Digraphs | HexFloat)
LANGSTANDARD_ALIAS_DEPR(cxx17, "c++1z")
LANGSTANDARD(gnucxx17, "gnu++17",
CXX, "ISO C++ 2017 with amendments and GNU extensions",
LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 |
Digraphs | HexFloat | GNUMode)
LANGSTANDARD_ALIAS_DEPR(gnucxx17, "gnu++1z")
LANGSTANDARD(cxx2a, "c++2a",
CXX, "Working draft for ISO C++ 2020",
LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 |
CPlusPlus2a | Digraphs | HexFloat)
LANGSTANDARD(gnucxx2a, "gnu++2a",
CXX, "Working draft for ISO C++ 2020 with GNU extensions",
LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 |
CPlusPlus2a | Digraphs | HexFloat | GNUMode)
// OpenCL
LANGSTANDARD(opencl10, "cl1.0",
OpenCL, "OpenCL 1.0",
LineComment | C99 | Digraphs | HexFloat | OpenCL)
LANGSTANDARD_ALIAS_DEPR(opencl10, "cl")
LANGSTANDARD(opencl11, "cl1.1",
OpenCL, "OpenCL 1.1",
LineComment | C99 | Digraphs | HexFloat | OpenCL)
LANGSTANDARD(opencl12, "cl1.2",
OpenCL, "OpenCL 1.2",
LineComment | C99 | Digraphs | HexFloat | OpenCL)
LANGSTANDARD(opencl20, "cl2.0",
OpenCL, "OpenCL 2.0",
LineComment | C99 | Digraphs | HexFloat | OpenCL)
-LANGSTANDARD(openclcpp, "c++",
+LANGSTANDARD(openclcpp, "clc++",
OpenCL, "C++ for OpenCL",
LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 |
Digraphs | HexFloat | OpenCL)
LANGSTANDARD_ALIAS_DEPR(opencl10, "CL")
LANGSTANDARD_ALIAS_DEPR(opencl11, "CL1.1")
LANGSTANDARD_ALIAS_DEPR(opencl12, "CL1.2")
LANGSTANDARD_ALIAS_DEPR(opencl20, "CL2.0")
LANGSTANDARD_ALIAS_DEPR(openclcpp, "CLC++")
// CUDA
LANGSTANDARD(cuda, "cuda", CUDA, "NVIDIA CUDA(tm)",
LineComment | CPlusPlus | Digraphs)
// HIP
LANGSTANDARD(hip, "hip", HIP, "HIP",
LineComment | CPlusPlus | Digraphs)
#undef LANGSTANDARD
#undef LANGSTANDARD_ALIAS
#undef LANGSTANDARD_ALIAS_DEPR
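For readers unfamiliar with the .def/X-macro pattern documented at the top of this file: a consumer defines LANGSTANDARD (and optionally the alias macros) before including the file and lets the preprocessor emit one entry per standard. The snippet below is an editor's sketch under that assumption, not the actual Clang consumer.
// Expand the .def file into a simple name/description table.
struct StdEntry { const char *Name; const char *Desc; };
static const StdEntry Entries[] = {
#define LANGSTANDARD(IDENT, NAME, LANG, DESC, FEATURES) {NAME, DESC},
#define LANGSTANDARD_ALIAS(IDENT, ALIAS) /* aliases skipped in this sketch */
#include "clang/Frontend/LangStandards.def"
};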
Index: projects/clang900-import/contrib/llvm/tools/clang/lib/Basic/Targets/RISCV.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/clang/lib/Basic/Targets/RISCV.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/clang/lib/Basic/Targets/RISCV.cpp (revision 351722)
@@ -1,136 +1,140 @@
//===--- RISCV.cpp - Implement RISCV target feature support ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements RISCV TargetInfo objects.
//
//===----------------------------------------------------------------------===//
#include "RISCV.h"
#include "clang/Basic/MacroBuilder.h"
#include "llvm/ADT/StringSwitch.h"
using namespace clang;
using namespace clang::targets;
ArrayRef<const char *> RISCVTargetInfo::getGCCRegNames() const {
static const char *const GCCRegNames[] = {
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
"x24", "x25", "x26", "x27", "x28", "x29", "x30", "x31"};
return llvm::makeArrayRef(GCCRegNames);
}
ArrayRef<TargetInfo::GCCRegAlias> RISCVTargetInfo::getGCCRegAliases() const {
static const TargetInfo::GCCRegAlias GCCRegAliases[] = {
{{"zero"}, "x0"}, {{"ra"}, "x1"}, {{"sp"}, "x2"}, {{"gp"}, "x3"},
{{"tp"}, "x4"}, {{"t0"}, "x5"}, {{"t1"}, "x6"}, {{"t2"}, "x7"},
{{"s0"}, "x8"}, {{"s1"}, "x9"}, {{"a0"}, "x10"}, {{"a1"}, "x11"},
{{"a2"}, "x12"}, {{"a3"}, "x13"}, {{"a4"}, "x14"}, {{"a5"}, "x15"},
{{"a6"}, "x16"}, {{"a7"}, "x17"}, {{"s2"}, "x18"}, {{"s3"}, "x19"},
{{"s4"}, "x20"}, {{"s5"}, "x21"}, {{"s6"}, "x22"}, {{"s7"}, "x23"},
{{"s8"}, "x24"}, {{"s9"}, "x25"}, {{"s10"}, "x26"}, {{"s11"}, "x27"},
{{"t3"}, "x28"}, {{"t4"}, "x29"}, {{"t5"}, "x30"}, {{"t6"}, "x31"}};
return llvm::makeArrayRef(GCCRegAliases);
}
bool RISCVTargetInfo::validateAsmConstraint(
const char *&Name, TargetInfo::ConstraintInfo &Info) const {
switch (*Name) {
default:
return false;
case 'I':
// A 12-bit signed immediate.
Info.setRequiresImmediate(-2048, 2047);
return true;
case 'J':
// Integer zero.
Info.setRequiresImmediate(0);
return true;
case 'K':
// A 5-bit unsigned immediate for CSR access instructions.
Info.setRequiresImmediate(0, 31);
return true;
case 'f':
// A floating-point register.
Info.setAllowsRegister();
return true;
+ case 'A':
+ // An address that is held in a general-purpose register.
+ Info.setAllowsMemory();
+ return true;
}
}
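To make the constraint letters above concrete, here is a hedged GNU inline-assembly example a RISC-V user could write once the new 'A' constraint is accepted; the instructions chosen and the wrapper function are the editor's illustration, not code from this revision.
// 'A' passes the address of *p as a memory operand held in a register,
// 'I' accepts a 12-bit signed immediate, and "=r"/"r" are ordinary GPRs.
static inline int load_plus_imm(const int *p) {
  int v;
  __asm__("lw   %0, %1\n\t"
          "addi %0, %0, %2"
          : "=r"(v)
          : "A"(*p), "I"(16));
  return v;
}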
void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const {
Builder.defineMacro("__ELF__");
Builder.defineMacro("__riscv");
bool Is64Bit = getTriple().getArch() == llvm::Triple::riscv64;
Builder.defineMacro("__riscv_xlen", Is64Bit ? "64" : "32");
// TODO: modify when more code models are supported.
Builder.defineMacro("__riscv_cmodel_medlow");
StringRef ABIName = getABI();
if (ABIName == "ilp32f" || ABIName == "lp64f")
Builder.defineMacro("__riscv_float_abi_single");
else if (ABIName == "ilp32d" || ABIName == "lp64d")
Builder.defineMacro("__riscv_float_abi_double");
else if (ABIName == "ilp32e")
Builder.defineMacro("__riscv_abi_rve");
else
Builder.defineMacro("__riscv_float_abi_soft");
if (HasM) {
Builder.defineMacro("__riscv_mul");
Builder.defineMacro("__riscv_div");
Builder.defineMacro("__riscv_muldiv");
}
if (HasA)
Builder.defineMacro("__riscv_atomic");
if (HasF || HasD) {
Builder.defineMacro("__riscv_flen", HasD ? "64" : "32");
Builder.defineMacro("__riscv_fdiv");
Builder.defineMacro("__riscv_fsqrt");
}
if (HasC)
Builder.defineMacro("__riscv_compressed");
}
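As a small usage sketch (editor's illustration), target code can key off the predefines produced by this function; only macros defined above are referenced.
#if defined(__riscv) && (__riscv_xlen == 64)
  // Compiling for a 64-bit RISC-V target.
#endif
#if defined(__riscv_atomic)
  // The 'A' (atomic) extension is available.
#endif
#if defined(__riscv_float_abi_double)
  // Hard-float double-precision ABI (ilp32d or lp64d).
#endif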
/// Return true if the target has this feature; keep in sync with handleTargetFeatures.
bool RISCVTargetInfo::hasFeature(StringRef Feature) const {
bool Is64Bit = getTriple().getArch() == llvm::Triple::riscv64;
return llvm::StringSwitch<bool>(Feature)
.Case("riscv", true)
.Case("riscv32", !Is64Bit)
.Case("riscv64", Is64Bit)
.Case("m", HasM)
.Case("a", HasA)
.Case("f", HasF)
.Case("d", HasD)
.Case("c", HasC)
.Default(false);
}
/// Perform initialization based on the user configured set of features.
bool RISCVTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
DiagnosticsEngine &Diags) {
for (const auto &Feature : Features) {
if (Feature == "+m")
HasM = true;
else if (Feature == "+a")
HasA = true;
else if (Feature == "+f")
HasF = true;
else if (Feature == "+d")
HasD = true;
else if (Feature == "+c")
HasC = true;
}
return true;
}
Index: projects/clang900-import/contrib/llvm/tools/clang/lib/Basic/Targets/RISCV.h
===================================================================
--- projects/clang900-import/contrib/llvm/tools/clang/lib/Basic/Targets/RISCV.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/clang/lib/Basic/Targets/RISCV.h (revision 351722)
@@ -1,117 +1,131 @@
//===--- RISCV.h - Declare RISCV target feature support ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares RISCV TargetInfo objects.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_RISCV_H
#define LLVM_CLANG_LIB_BASIC_TARGETS_RISCV_H
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Compiler.h"
namespace clang {
namespace targets {
// RISC-V Target
class RISCVTargetInfo : public TargetInfo {
protected:
std::string ABI;
bool HasM;
bool HasA;
bool HasF;
bool HasD;
bool HasC;
public:
RISCVTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
: TargetInfo(Triple), HasM(false), HasA(false), HasF(false),
HasD(false), HasC(false) {
LongDoubleWidth = 128;
LongDoubleAlign = 128;
LongDoubleFormat = &llvm::APFloat::IEEEquad();
SuitableAlign = 128;
WCharType = SignedInt;
WIntType = UnsignedInt;
}
StringRef getABI() const override { return ABI; }
void getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const override;
ArrayRef<Builtin::Info> getTargetBuiltins() const override { return None; }
BuiltinVaListKind getBuiltinVaListKind() const override {
return TargetInfo::VoidPtrBuiltinVaList;
}
const char *getClobbers() const override { return ""; }
ArrayRef<const char *> getGCCRegNames() const override;
int getEHDataRegisterNumber(unsigned RegNo) const override {
if (RegNo == 0)
return 10;
else if (RegNo == 1)
return 11;
else
return -1;
}
ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
bool validateAsmConstraint(const char *&Name,
TargetInfo::ConstraintInfo &Info) const override;
bool hasFeature(StringRef Feature) const override;
bool handleTargetFeatures(std::vector<std::string> &Features,
DiagnosticsEngine &Diags) override;
};
class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo {
public:
RISCV32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
: RISCVTargetInfo(Triple, Opts) {
IntPtrType = SignedInt;
PtrDiffType = SignedInt;
SizeType = UnsignedInt;
resetDataLayout("e-m:e-p:32:32-i64:64-n32-S128");
}
bool setABI(const std::string &Name) override {
if (Name == "ilp32" || Name == "ilp32f" || Name == "ilp32d") {
ABI = Name;
return true;
}
return false;
}
+
+ void setMaxAtomicWidth() override {
+ MaxAtomicPromoteWidth = 128;
+
+ if (HasA)
+ MaxAtomicInlineWidth = 32;
+ }
};
class LLVM_LIBRARY_VISIBILITY RISCV64TargetInfo : public RISCVTargetInfo {
public:
RISCV64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
: RISCVTargetInfo(Triple, Opts) {
LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
IntMaxType = Int64Type = SignedLong;
resetDataLayout("e-m:e-p:64:64-i64:64-i128:128-n64-S128");
}
bool setABI(const std::string &Name) override {
if (Name == "lp64" || Name == "lp64f" || Name == "lp64d") {
ABI = Name;
return true;
}
return false;
+ }
+
+ void setMaxAtomicWidth() override {
+ MaxAtomicPromoteWidth = 128;
+
+ if (HasA)
+ MaxAtomicInlineWidth = 64;
}
};
} // namespace targets
} // namespace clang
#endif // LLVM_CLANG_LIB_BASIC_TARGETS_RISCV_H
Index: projects/clang900-import/contrib/llvm/tools/clang/lib/Headers/opencl-c.h
===================================================================
--- projects/clang900-import/contrib/llvm/tools/clang/lib/Headers/opencl-c.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/clang/lib/Headers/opencl-c.h (revision 351722)
@@ -1,16502 +1,16502 @@
//===--- opencl-c.h - OpenCL C language builtin function header -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _OPENCL_H_
#define _OPENCL_H_
#include "opencl-c-base.h"
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#ifndef cl_khr_depth_images
#define cl_khr_depth_images
#endif //cl_khr_depth_images
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
#ifdef cl_khr_3d_image_writes
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
#endif //cl_khr_3d_image_writes
#endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
#pragma OPENCL EXTENSION cl_intel_planar_yuv : begin
#pragma OPENCL EXTENSION cl_intel_planar_yuv : end
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
#define __ovld __attribute__((overloadable))
#define __conv __attribute__((convergent))
// Optimizations
#define __purefn __attribute__((pure))
#define __cnfn __attribute__((const))
// OpenCL v1.1/1.2/2.0 s6.2.3 - Explicit conversions
char __ovld __cnfn convert_char_rte(char);
char __ovld __cnfn convert_char_sat_rte(char);
char __ovld __cnfn convert_char_rtz(char);
char __ovld __cnfn convert_char_sat_rtz(char);
char __ovld __cnfn convert_char_rtp(char);
char __ovld __cnfn convert_char_sat_rtp(char);
char __ovld __cnfn convert_char_rtn(char);
char __ovld __cnfn convert_char_sat_rtn(char);
char __ovld __cnfn convert_char(char);
char __ovld __cnfn convert_char_sat(char);
char __ovld __cnfn convert_char_rte(uchar);
char __ovld __cnfn convert_char_sat_rte(uchar);
char __ovld __cnfn convert_char_rtz(uchar);
char __ovld __cnfn convert_char_sat_rtz(uchar);
char __ovld __cnfn convert_char_rtp(uchar);
char __ovld __cnfn convert_char_sat_rtp(uchar);
char __ovld __cnfn convert_char_rtn(uchar);
char __ovld __cnfn convert_char_sat_rtn(uchar);
char __ovld __cnfn convert_char(uchar);
char __ovld __cnfn convert_char_sat(uchar);
char __ovld __cnfn convert_char_rte(short);
char __ovld __cnfn convert_char_sat_rte(short);
char __ovld __cnfn convert_char_rtz(short);
char __ovld __cnfn convert_char_sat_rtz(short);
char __ovld __cnfn convert_char_rtp(short);
char __ovld __cnfn convert_char_sat_rtp(short);
char __ovld __cnfn convert_char_rtn(short);
char __ovld __cnfn convert_char_sat_rtn(short);
char __ovld __cnfn convert_char(short);
char __ovld __cnfn convert_char_sat(short);
char __ovld __cnfn convert_char_rte(ushort);
char __ovld __cnfn convert_char_sat_rte(ushort);
char __ovld __cnfn convert_char_rtz(ushort);
char __ovld __cnfn convert_char_sat_rtz(ushort);
char __ovld __cnfn convert_char_rtp(ushort);
char __ovld __cnfn convert_char_sat_rtp(ushort);
char __ovld __cnfn convert_char_rtn(ushort);
char __ovld __cnfn convert_char_sat_rtn(ushort);
char __ovld __cnfn convert_char(ushort);
char __ovld __cnfn convert_char_sat(ushort);
char __ovld __cnfn convert_char_rte(int);
char __ovld __cnfn convert_char_sat_rte(int);
char __ovld __cnfn convert_char_rtz(int);
char __ovld __cnfn convert_char_sat_rtz(int);
char __ovld __cnfn convert_char_rtp(int);
char __ovld __cnfn convert_char_sat_rtp(int);
char __ovld __cnfn convert_char_rtn(int);
char __ovld __cnfn convert_char_sat_rtn(int);
char __ovld __cnfn convert_char(int);
char __ovld __cnfn convert_char_sat(int);
char __ovld __cnfn convert_char_rte(uint);
char __ovld __cnfn convert_char_sat_rte(uint);
char __ovld __cnfn convert_char_rtz(uint);
char __ovld __cnfn convert_char_sat_rtz(uint);
char __ovld __cnfn convert_char_rtp(uint);
char __ovld __cnfn convert_char_sat_rtp(uint);
char __ovld __cnfn convert_char_rtn(uint);
char __ovld __cnfn convert_char_sat_rtn(uint);
char __ovld __cnfn convert_char(uint);
char __ovld __cnfn convert_char_sat(uint);
char __ovld __cnfn convert_char_rte(long);
char __ovld __cnfn convert_char_sat_rte(long);
char __ovld __cnfn convert_char_rtz(long);
char __ovld __cnfn convert_char_sat_rtz(long);
char __ovld __cnfn convert_char_rtp(long);
char __ovld __cnfn convert_char_sat_rtp(long);
char __ovld __cnfn convert_char_rtn(long);
char __ovld __cnfn convert_char_sat_rtn(long);
char __ovld __cnfn convert_char(long);
char __ovld __cnfn convert_char_sat(long);
char __ovld __cnfn convert_char_rte(ulong);
char __ovld __cnfn convert_char_sat_rte(ulong);
char __ovld __cnfn convert_char_rtz(ulong);
char __ovld __cnfn convert_char_sat_rtz(ulong);
char __ovld __cnfn convert_char_rtp(ulong);
char __ovld __cnfn convert_char_sat_rtp(ulong);
char __ovld __cnfn convert_char_rtn(ulong);
char __ovld __cnfn convert_char_sat_rtn(ulong);
char __ovld __cnfn convert_char(ulong);
char __ovld __cnfn convert_char_sat(ulong);
char __ovld __cnfn convert_char_rte(float);
char __ovld __cnfn convert_char_sat_rte(float);
char __ovld __cnfn convert_char_rtz(float);
char __ovld __cnfn convert_char_sat_rtz(float);
char __ovld __cnfn convert_char_rtp(float);
char __ovld __cnfn convert_char_sat_rtp(float);
char __ovld __cnfn convert_char_rtn(float);
char __ovld __cnfn convert_char_sat_rtn(float);
char __ovld __cnfn convert_char(float);
char __ovld __cnfn convert_char_sat(float);
uchar __ovld __cnfn convert_uchar_rte(char);
uchar __ovld __cnfn convert_uchar_sat_rte(char);
uchar __ovld __cnfn convert_uchar_rtz(char);
uchar __ovld __cnfn convert_uchar_sat_rtz(char);
uchar __ovld __cnfn convert_uchar_rtp(char);
uchar __ovld __cnfn convert_uchar_sat_rtp(char);
uchar __ovld __cnfn convert_uchar_rtn(char);
uchar __ovld __cnfn convert_uchar_sat_rtn(char);
uchar __ovld __cnfn convert_uchar(char);
uchar __ovld __cnfn convert_uchar_sat(char);
uchar __ovld __cnfn convert_uchar_rte(uchar);
uchar __ovld __cnfn convert_uchar_sat_rte(uchar);
uchar __ovld __cnfn convert_uchar_rtz(uchar);
uchar __ovld __cnfn convert_uchar_sat_rtz(uchar);
uchar __ovld __cnfn convert_uchar_rtp(uchar);
uchar __ovld __cnfn convert_uchar_sat_rtp(uchar);
uchar __ovld __cnfn convert_uchar_rtn(uchar);
uchar __ovld __cnfn convert_uchar_sat_rtn(uchar);
uchar __ovld __cnfn convert_uchar(uchar);
uchar __ovld __cnfn convert_uchar_sat(uchar);
uchar __ovld __cnfn convert_uchar_rte(short);
uchar __ovld __cnfn convert_uchar_sat_rte(short);
uchar __ovld __cnfn convert_uchar_rtz(short);
uchar __ovld __cnfn convert_uchar_sat_rtz(short);
uchar __ovld __cnfn convert_uchar_rtp(short);
uchar __ovld __cnfn convert_uchar_sat_rtp(short);
uchar __ovld __cnfn convert_uchar_rtn(short);
uchar __ovld __cnfn convert_uchar_sat_rtn(short);
uchar __ovld __cnfn convert_uchar(short);
uchar __ovld __cnfn convert_uchar_sat(short);
uchar __ovld __cnfn convert_uchar_rte(ushort);
uchar __ovld __cnfn convert_uchar_sat_rte(ushort);
uchar __ovld __cnfn convert_uchar_rtz(ushort);
uchar __ovld __cnfn convert_uchar_sat_rtz(ushort);
uchar __ovld __cnfn convert_uchar_rtp(ushort);
uchar __ovld __cnfn convert_uchar_sat_rtp(ushort);
uchar __ovld __cnfn convert_uchar_rtn(ushort);
uchar __ovld __cnfn convert_uchar_sat_rtn(ushort);
uchar __ovld __cnfn convert_uchar(ushort);
uchar __ovld __cnfn convert_uchar_sat(ushort);
uchar __ovld __cnfn convert_uchar_rte(int);
uchar __ovld __cnfn convert_uchar_sat_rte(int);
uchar __ovld __cnfn convert_uchar_rtz(int);
uchar __ovld __cnfn convert_uchar_sat_rtz(int);
uchar __ovld __cnfn convert_uchar_rtp(int);
uchar __ovld __cnfn convert_uchar_sat_rtp(int);
uchar __ovld __cnfn convert_uchar_rtn(int);
uchar __ovld __cnfn convert_uchar_sat_rtn(int);
uchar __ovld __cnfn convert_uchar(int);
uchar __ovld __cnfn convert_uchar_sat(int);
uchar __ovld __cnfn convert_uchar_rte(uint);
uchar __ovld __cnfn convert_uchar_sat_rte(uint);
uchar __ovld __cnfn convert_uchar_rtz(uint);
uchar __ovld __cnfn convert_uchar_sat_rtz(uint);
uchar __ovld __cnfn convert_uchar_rtp(uint);
uchar __ovld __cnfn convert_uchar_sat_rtp(uint);
uchar __ovld __cnfn convert_uchar_rtn(uint);
uchar __ovld __cnfn convert_uchar_sat_rtn(uint);
uchar __ovld __cnfn convert_uchar(uint);
uchar __ovld __cnfn convert_uchar_sat(uint);
uchar __ovld __cnfn convert_uchar_rte(long);
uchar __ovld __cnfn convert_uchar_sat_rte(long);
uchar __ovld __cnfn convert_uchar_rtz(long);
uchar __ovld __cnfn convert_uchar_sat_rtz(long);
uchar __ovld __cnfn convert_uchar_rtp(long);
uchar __ovld __cnfn convert_uchar_sat_rtp(long);
uchar __ovld __cnfn convert_uchar_rtn(long);
uchar __ovld __cnfn convert_uchar_sat_rtn(long);
uchar __ovld __cnfn convert_uchar(long);
uchar __ovld __cnfn convert_uchar_sat(long);
uchar __ovld __cnfn convert_uchar_rte(ulong);
uchar __ovld __cnfn convert_uchar_sat_rte(ulong);
uchar __ovld __cnfn convert_uchar_rtz(ulong);
uchar __ovld __cnfn convert_uchar_sat_rtz(ulong);
uchar __ovld __cnfn convert_uchar_rtp(ulong);
uchar __ovld __cnfn convert_uchar_sat_rtp(ulong);
uchar __ovld __cnfn convert_uchar_rtn(ulong);
uchar __ovld __cnfn convert_uchar_sat_rtn(ulong);
uchar __ovld __cnfn convert_uchar(ulong);
uchar __ovld __cnfn convert_uchar_sat(ulong);
uchar __ovld __cnfn convert_uchar_rte(float);
uchar __ovld __cnfn convert_uchar_sat_rte(float);
uchar __ovld __cnfn convert_uchar_rtz(float);
uchar __ovld __cnfn convert_uchar_sat_rtz(float);
uchar __ovld __cnfn convert_uchar_rtp(float);
uchar __ovld __cnfn convert_uchar_sat_rtp(float);
uchar __ovld __cnfn convert_uchar_rtn(float);
uchar __ovld __cnfn convert_uchar_sat_rtn(float);
uchar __ovld __cnfn convert_uchar(float);
uchar __ovld __cnfn convert_uchar_sat(float);
short __ovld __cnfn convert_short_rte(char);
short __ovld __cnfn convert_short_sat_rte(char);
short __ovld __cnfn convert_short_rtz(char);
short __ovld __cnfn convert_short_sat_rtz(char);
short __ovld __cnfn convert_short_rtp(char);
short __ovld __cnfn convert_short_sat_rtp(char);
short __ovld __cnfn convert_short_rtn(char);
short __ovld __cnfn convert_short_sat_rtn(char);
short __ovld __cnfn convert_short(char);
short __ovld __cnfn convert_short_sat(char);
short __ovld __cnfn convert_short_rte(uchar);
short __ovld __cnfn convert_short_sat_rte(uchar);
short __ovld __cnfn convert_short_rtz(uchar);
short __ovld __cnfn convert_short_sat_rtz(uchar);
short __ovld __cnfn convert_short_rtp(uchar);
short __ovld __cnfn convert_short_sat_rtp(uchar);
short __ovld __cnfn convert_short_rtn(uchar);
short __ovld __cnfn convert_short_sat_rtn(uchar);
short __ovld __cnfn convert_short(uchar);
short __ovld __cnfn convert_short_sat(uchar);
short __ovld __cnfn convert_short_rte(short);
short __ovld __cnfn convert_short_sat_rte(short);
short __ovld __cnfn convert_short_rtz(short);
short __ovld __cnfn convert_short_sat_rtz(short);
short __ovld __cnfn convert_short_rtp(short);
short __ovld __cnfn convert_short_sat_rtp(short);
short __ovld __cnfn convert_short_rtn(short);
short __ovld __cnfn convert_short_sat_rtn(short);
short __ovld __cnfn convert_short(short);
short __ovld __cnfn convert_short_sat(short);
short __ovld __cnfn convert_short_rte(ushort);
short __ovld __cnfn convert_short_sat_rte(ushort);
short __ovld __cnfn convert_short_rtz(ushort);
short __ovld __cnfn convert_short_sat_rtz(ushort);
short __ovld __cnfn convert_short_rtp(ushort);
short __ovld __cnfn convert_short_sat_rtp(ushort);
short __ovld __cnfn convert_short_rtn(ushort);
short __ovld __cnfn convert_short_sat_rtn(ushort);
short __ovld __cnfn convert_short(ushort);
short __ovld __cnfn convert_short_sat(ushort);
short __ovld __cnfn convert_short_rte(int);
short __ovld __cnfn convert_short_sat_rte(int);
short __ovld __cnfn convert_short_rtz(int);
short __ovld __cnfn convert_short_sat_rtz(int);
short __ovld __cnfn convert_short_rtp(int);
short __ovld __cnfn convert_short_sat_rtp(int);
short __ovld __cnfn convert_short_rtn(int);
short __ovld __cnfn convert_short_sat_rtn(int);
short __ovld __cnfn convert_short(int);
short __ovld __cnfn convert_short_sat(int);
short __ovld __cnfn convert_short_rte(uint);
short __ovld __cnfn convert_short_sat_rte(uint);
short __ovld __cnfn convert_short_rtz(uint);
short __ovld __cnfn convert_short_sat_rtz(uint);
short __ovld __cnfn convert_short_rtp(uint);
short __ovld __cnfn convert_short_sat_rtp(uint);
short __ovld __cnfn convert_short_rtn(uint);
short __ovld __cnfn convert_short_sat_rtn(uint);
short __ovld __cnfn convert_short(uint);
short __ovld __cnfn convert_short_sat(uint);
short __ovld __cnfn convert_short_rte(long);
short __ovld __cnfn convert_short_sat_rte(long);
short __ovld __cnfn convert_short_rtz(long);
short __ovld __cnfn convert_short_sat_rtz(long);
short __ovld __cnfn convert_short_rtp(long);
short __ovld __cnfn convert_short_sat_rtp(long);
short __ovld __cnfn convert_short_rtn(long);
short __ovld __cnfn convert_short_sat_rtn(long);
short __ovld __cnfn convert_short(long);
short __ovld __cnfn convert_short_sat(long);
short __ovld __cnfn convert_short_rte(ulong);
short __ovld __cnfn convert_short_sat_rte(ulong);
short __ovld __cnfn convert_short_rtz(ulong);
short __ovld __cnfn convert_short_sat_rtz(ulong);
short __ovld __cnfn convert_short_rtp(ulong);
short __ovld __cnfn convert_short_sat_rtp(ulong);
short __ovld __cnfn convert_short_rtn(ulong);
short __ovld __cnfn convert_short_sat_rtn(ulong);
short __ovld __cnfn convert_short(ulong);
short __ovld __cnfn convert_short_sat(ulong);
short __ovld __cnfn convert_short_rte(float);
short __ovld __cnfn convert_short_sat_rte(float);
short __ovld __cnfn convert_short_rtz(float);
short __ovld __cnfn convert_short_sat_rtz(float);
short __ovld __cnfn convert_short_rtp(float);
short __ovld __cnfn convert_short_sat_rtp(float);
short __ovld __cnfn convert_short_rtn(float);
short __ovld __cnfn convert_short_sat_rtn(float);
short __ovld __cnfn convert_short(float);
short __ovld __cnfn convert_short_sat(float);
ushort __ovld __cnfn convert_ushort_rte(char);
ushort __ovld __cnfn convert_ushort_sat_rte(char);
ushort __ovld __cnfn convert_ushort_rtz(char);
ushort __ovld __cnfn convert_ushort_sat_rtz(char);
ushort __ovld __cnfn convert_ushort_rtp(char);
ushort __ovld __cnfn convert_ushort_sat_rtp(char);
ushort __ovld __cnfn convert_ushort_rtn(char);
ushort __ovld __cnfn convert_ushort_sat_rtn(char);
ushort __ovld __cnfn convert_ushort(char);
ushort __ovld __cnfn convert_ushort_sat(char);
ushort __ovld __cnfn convert_ushort_rte(uchar);
ushort __ovld __cnfn convert_ushort_sat_rte(uchar);
ushort __ovld __cnfn convert_ushort_rtz(uchar);
ushort __ovld __cnfn convert_ushort_sat_rtz(uchar);
ushort __ovld __cnfn convert_ushort_rtp(uchar);
ushort __ovld __cnfn convert_ushort_sat_rtp(uchar);
ushort __ovld __cnfn convert_ushort_rtn(uchar);
ushort __ovld __cnfn convert_ushort_sat_rtn(uchar);
ushort __ovld __cnfn convert_ushort(uchar);
ushort __ovld __cnfn convert_ushort_sat(uchar);
ushort __ovld __cnfn convert_ushort_rte(short);
ushort __ovld __cnfn convert_ushort_sat_rte(short);
ushort __ovld __cnfn convert_ushort_rtz(short);
ushort __ovld __cnfn convert_ushort_sat_rtz(short);
ushort __ovld __cnfn convert_ushort_rtp(short);
ushort __ovld __cnfn convert_ushort_sat_rtp(short);
ushort __ovld __cnfn convert_ushort_rtn(short);
ushort __ovld __cnfn convert_ushort_sat_rtn(short);
ushort __ovld __cnfn convert_ushort(short);
ushort __ovld __cnfn convert_ushort_sat(short);
ushort __ovld __cnfn convert_ushort_rte(ushort);
ushort __ovld __cnfn convert_ushort_sat_rte(ushort);
ushort __ovld __cnfn convert_ushort_rtz(ushort);
ushort __ovld __cnfn convert_ushort_sat_rtz(ushort);
ushort __ovld __cnfn convert_ushort_rtp(ushort);
ushort __ovld __cnfn convert_ushort_sat_rtp(ushort);
ushort __ovld __cnfn convert_ushort_rtn(ushort);
ushort __ovld __cnfn convert_ushort_sat_rtn(ushort);
ushort __ovld __cnfn convert_ushort(ushort);
ushort __ovld __cnfn convert_ushort_sat(ushort);
ushort __ovld __cnfn convert_ushort_rte(int);
ushort __ovld __cnfn convert_ushort_sat_rte(int);
ushort __ovld __cnfn convert_ushort_rtz(int);
ushort __ovld __cnfn convert_ushort_sat_rtz(int);
ushort __ovld __cnfn convert_ushort_rtp(int);
ushort __ovld __cnfn convert_ushort_sat_rtp(int);
ushort __ovld __cnfn convert_ushort_rtn(int);
ushort __ovld __cnfn convert_ushort_sat_rtn(int);
ushort __ovld __cnfn convert_ushort(int);
ushort __ovld __cnfn convert_ushort_sat(int);
ushort __ovld __cnfn convert_ushort_rte(uint);
ushort __ovld __cnfn convert_ushort_sat_rte(uint);
ushort __ovld __cnfn convert_ushort_rtz(uint);
ushort __ovld __cnfn convert_ushort_sat_rtz(uint);
ushort __ovld __cnfn convert_ushort_rtp(uint);
ushort __ovld __cnfn convert_ushort_sat_rtp(uint);
ushort __ovld __cnfn convert_ushort_rtn(uint);
ushort __ovld __cnfn convert_ushort_sat_rtn(uint);
ushort __ovld __cnfn convert_ushort(uint);
ushort __ovld __cnfn convert_ushort_sat(uint);
ushort __ovld __cnfn convert_ushort_rte(long);
ushort __ovld __cnfn convert_ushort_sat_rte(long);
ushort __ovld __cnfn convert_ushort_rtz(long);
ushort __ovld __cnfn convert_ushort_sat_rtz(long);
ushort __ovld __cnfn convert_ushort_rtp(long);
ushort __ovld __cnfn convert_ushort_sat_rtp(long);
ushort __ovld __cnfn convert_ushort_rtn(long);
ushort __ovld __cnfn convert_ushort_sat_rtn(long);
ushort __ovld __cnfn convert_ushort(long);
ushort __ovld __cnfn convert_ushort_sat(long);
ushort __ovld __cnfn convert_ushort_rte(ulong);
ushort __ovld __cnfn convert_ushort_sat_rte(ulong);
ushort __ovld __cnfn convert_ushort_rtz(ulong);
ushort __ovld __cnfn convert_ushort_sat_rtz(ulong);
ushort __ovld __cnfn convert_ushort_rtp(ulong);
ushort __ovld __cnfn convert_ushort_sat_rtp(ulong);
ushort __ovld __cnfn convert_ushort_rtn(ulong);
ushort __ovld __cnfn convert_ushort_sat_rtn(ulong);
ushort __ovld __cnfn convert_ushort(ulong);
ushort __ovld __cnfn convert_ushort_sat(ulong);
ushort __ovld __cnfn convert_ushort_rte(float);
ushort __ovld __cnfn convert_ushort_sat_rte(float);
ushort __ovld __cnfn convert_ushort_rtz(float);
ushort __ovld __cnfn convert_ushort_sat_rtz(float);
ushort __ovld __cnfn convert_ushort_rtp(float);
ushort __ovld __cnfn convert_ushort_sat_rtp(float);
ushort __ovld __cnfn convert_ushort_rtn(float);
ushort __ovld __cnfn convert_ushort_sat_rtn(float);
ushort __ovld __cnfn convert_ushort(float);
ushort __ovld __cnfn convert_ushort_sat(float);
int __ovld __cnfn convert_int_rte(char);
int __ovld __cnfn convert_int_sat_rte(char);
int __ovld __cnfn convert_int_rtz(char);
int __ovld __cnfn convert_int_sat_rtz(char);
int __ovld __cnfn convert_int_rtp(char);
int __ovld __cnfn convert_int_sat_rtp(char);
int __ovld __cnfn convert_int_rtn(char);
int __ovld __cnfn convert_int_sat_rtn(char);
int __ovld __cnfn convert_int(char);
int __ovld __cnfn convert_int_sat(char);
int __ovld __cnfn convert_int_rte(uchar);
int __ovld __cnfn convert_int_sat_rte(uchar);
int __ovld __cnfn convert_int_rtz(uchar);
int __ovld __cnfn convert_int_sat_rtz(uchar);
int __ovld __cnfn convert_int_rtp(uchar);
int __ovld __cnfn convert_int_sat_rtp(uchar);
int __ovld __cnfn convert_int_rtn(uchar);
int __ovld __cnfn convert_int_sat_rtn(uchar);
int __ovld __cnfn convert_int(uchar);
int __ovld __cnfn convert_int_sat(uchar);
int __ovld __cnfn convert_int_rte(short);
int __ovld __cnfn convert_int_sat_rte(short);
int __ovld __cnfn convert_int_rtz(short);
int __ovld __cnfn convert_int_sat_rtz(short);
int __ovld __cnfn convert_int_rtp(short);
int __ovld __cnfn convert_int_sat_rtp(short);
int __ovld __cnfn convert_int_rtn(short);
int __ovld __cnfn convert_int_sat_rtn(short);
int __ovld __cnfn convert_int(short);
int __ovld __cnfn convert_int_sat(short);
int __ovld __cnfn convert_int_rte(ushort);
int __ovld __cnfn convert_int_sat_rte(ushort);
int __ovld __cnfn convert_int_rtz(ushort);
int __ovld __cnfn convert_int_sat_rtz(ushort);
int __ovld __cnfn convert_int_rtp(ushort);
int __ovld __cnfn convert_int_sat_rtp(ushort);
int __ovld __cnfn convert_int_rtn(ushort);
int __ovld __cnfn convert_int_sat_rtn(ushort);
int __ovld __cnfn convert_int(ushort);
int __ovld __cnfn convert_int_sat(ushort);
int __ovld __cnfn convert_int_rte(int);
int __ovld __cnfn convert_int_sat_rte(int);
int __ovld __cnfn convert_int_rtz(int);
int __ovld __cnfn convert_int_sat_rtz(int);
int __ovld __cnfn convert_int_rtp(int);
int __ovld __cnfn convert_int_sat_rtp(int);
int __ovld __cnfn convert_int_rtn(int);
int __ovld __cnfn convert_int_sat_rtn(int);
int __ovld __cnfn convert_int(int);
int __ovld __cnfn convert_int_sat(int);
int __ovld __cnfn convert_int_rte(uint);
int __ovld __cnfn convert_int_sat_rte(uint);
int __ovld __cnfn convert_int_rtz(uint);
int __ovld __cnfn convert_int_sat_rtz(uint);
int __ovld __cnfn convert_int_rtp(uint);
int __ovld __cnfn convert_int_sat_rtp(uint);
int __ovld __cnfn convert_int_rtn(uint);
int __ovld __cnfn convert_int_sat_rtn(uint);
int __ovld __cnfn convert_int(uint);
int __ovld __cnfn convert_int_sat(uint);
int __ovld __cnfn convert_int_rte(long);
int __ovld __cnfn convert_int_sat_rte(long);
int __ovld __cnfn convert_int_rtz(long);
int __ovld __cnfn convert_int_sat_rtz(long);
int __ovld __cnfn convert_int_rtp(long);
int __ovld __cnfn convert_int_sat_rtp(long);
int __ovld __cnfn convert_int_rtn(long);
int __ovld __cnfn convert_int_sat_rtn(long);
int __ovld __cnfn convert_int(long);
int __ovld __cnfn convert_int_sat(long);
int __ovld __cnfn convert_int_rte(ulong);
int __ovld __cnfn convert_int_sat_rte(ulong);
int __ovld __cnfn convert_int_rtz(ulong);
int __ovld __cnfn convert_int_sat_rtz(ulong);
int __ovld __cnfn convert_int_rtp(ulong);
int __ovld __cnfn convert_int_sat_rtp(ulong);
int __ovld __cnfn convert_int_rtn(ulong);
int __ovld __cnfn convert_int_sat_rtn(ulong);
int __ovld __cnfn convert_int(ulong);
int __ovld __cnfn convert_int_sat(ulong);
int __ovld __cnfn convert_int_rte(float);
int __ovld __cnfn convert_int_sat_rte(float);
int __ovld __cnfn convert_int_rtz(float);
int __ovld __cnfn convert_int_sat_rtz(float);
int __ovld __cnfn convert_int_rtp(float);
int __ovld __cnfn convert_int_sat_rtp(float);
int __ovld __cnfn convert_int_rtn(float);
int __ovld __cnfn convert_int_sat_rtn(float);
int __ovld __cnfn convert_int(float);
int __ovld __cnfn convert_int_sat(float);
uint __ovld __cnfn convert_uint_rte(char);
uint __ovld __cnfn convert_uint_sat_rte(char);
uint __ovld __cnfn convert_uint_rtz(char);
uint __ovld __cnfn convert_uint_sat_rtz(char);
uint __ovld __cnfn convert_uint_rtp(char);
uint __ovld __cnfn convert_uint_sat_rtp(char);
uint __ovld __cnfn convert_uint_rtn(char);
uint __ovld __cnfn convert_uint_sat_rtn(char);
uint __ovld __cnfn convert_uint(char);
uint __ovld __cnfn convert_uint_sat(char);
uint __ovld __cnfn convert_uint_rte(uchar);
uint __ovld __cnfn convert_uint_sat_rte(uchar);
uint __ovld __cnfn convert_uint_rtz(uchar);
uint __ovld __cnfn convert_uint_sat_rtz(uchar);
uint __ovld __cnfn convert_uint_rtp(uchar);
uint __ovld __cnfn convert_uint_sat_rtp(uchar);
uint __ovld __cnfn convert_uint_rtn(uchar);
uint __ovld __cnfn convert_uint_sat_rtn(uchar);
uint __ovld __cnfn convert_uint(uchar);
uint __ovld __cnfn convert_uint_sat(uchar);
uint __ovld __cnfn convert_uint_rte(short);
uint __ovld __cnfn convert_uint_sat_rte(short);
uint __ovld __cnfn convert_uint_rtz(short);
uint __ovld __cnfn convert_uint_sat_rtz(short);
uint __ovld __cnfn convert_uint_rtp(short);
uint __ovld __cnfn convert_uint_sat_rtp(short);
uint __ovld __cnfn convert_uint_rtn(short);
uint __ovld __cnfn convert_uint_sat_rtn(short);
uint __ovld __cnfn convert_uint(short);
uint __ovld __cnfn convert_uint_sat(short);
uint __ovld __cnfn convert_uint_rte(ushort);
uint __ovld __cnfn convert_uint_sat_rte(ushort);
uint __ovld __cnfn convert_uint_rtz(ushort);
uint __ovld __cnfn convert_uint_sat_rtz(ushort);
uint __ovld __cnfn convert_uint_rtp(ushort);
uint __ovld __cnfn convert_uint_sat_rtp(ushort);
uint __ovld __cnfn convert_uint_rtn(ushort);
uint __ovld __cnfn convert_uint_sat_rtn(ushort);
uint __ovld __cnfn convert_uint(ushort);
uint __ovld __cnfn convert_uint_sat(ushort);
uint __ovld __cnfn convert_uint_rte(int);
uint __ovld __cnfn convert_uint_sat_rte(int);
uint __ovld __cnfn convert_uint_rtz(int);
uint __ovld __cnfn convert_uint_sat_rtz(int);
uint __ovld __cnfn convert_uint_rtp(int);
uint __ovld __cnfn convert_uint_sat_rtp(int);
uint __ovld __cnfn convert_uint_rtn(int);
uint __ovld __cnfn convert_uint_sat_rtn(int);
uint __ovld __cnfn convert_uint(int);
uint __ovld __cnfn convert_uint_sat(int);
uint __ovld __cnfn convert_uint_rte(uint);
uint __ovld __cnfn convert_uint_sat_rte(uint);
uint __ovld __cnfn convert_uint_rtz(uint);
uint __ovld __cnfn convert_uint_sat_rtz(uint);
uint __ovld __cnfn convert_uint_rtp(uint);
uint __ovld __cnfn convert_uint_sat_rtp(uint);
uint __ovld __cnfn convert_uint_rtn(uint);
uint __ovld __cnfn convert_uint_sat_rtn(uint);
uint __ovld __cnfn convert_uint(uint);
uint __ovld __cnfn convert_uint_sat(uint);
uint __ovld __cnfn convert_uint_rte(long);
uint __ovld __cnfn convert_uint_sat_rte(long);
uint __ovld __cnfn convert_uint_rtz(long);
uint __ovld __cnfn convert_uint_sat_rtz(long);
uint __ovld __cnfn convert_uint_rtp(long);
uint __ovld __cnfn convert_uint_sat_rtp(long);
uint __ovld __cnfn convert_uint_rtn(long);
uint __ovld __cnfn convert_uint_sat_rtn(long);
uint __ovld __cnfn convert_uint(long);
uint __ovld __cnfn convert_uint_sat(long);
uint __ovld __cnfn convert_uint_rte(ulong);
uint __ovld __cnfn convert_uint_sat_rte(ulong);
uint __ovld __cnfn convert_uint_rtz(ulong);
uint __ovld __cnfn convert_uint_sat_rtz(ulong);
uint __ovld __cnfn convert_uint_rtp(ulong);
uint __ovld __cnfn convert_uint_sat_rtp(ulong);
uint __ovld __cnfn convert_uint_rtn(ulong);
uint __ovld __cnfn convert_uint_sat_rtn(ulong);
uint __ovld __cnfn convert_uint(ulong);
uint __ovld __cnfn convert_uint_sat(ulong);
uint __ovld __cnfn convert_uint_rte(float);
uint __ovld __cnfn convert_uint_sat_rte(float);
uint __ovld __cnfn convert_uint_rtz(float);
uint __ovld __cnfn convert_uint_sat_rtz(float);
uint __ovld __cnfn convert_uint_rtp(float);
uint __ovld __cnfn convert_uint_sat_rtp(float);
uint __ovld __cnfn convert_uint_rtn(float);
uint __ovld __cnfn convert_uint_sat_rtn(float);
uint __ovld __cnfn convert_uint(float);
uint __ovld __cnfn convert_uint_sat(float);
long __ovld __cnfn convert_long_rte(char);
long __ovld __cnfn convert_long_sat_rte(char);
long __ovld __cnfn convert_long_rtz(char);
long __ovld __cnfn convert_long_sat_rtz(char);
long __ovld __cnfn convert_long_rtp(char);
long __ovld __cnfn convert_long_sat_rtp(char);
long __ovld __cnfn convert_long_rtn(char);
long __ovld __cnfn convert_long_sat_rtn(char);
long __ovld __cnfn convert_long(char);
long __ovld __cnfn convert_long_sat(char);
long __ovld __cnfn convert_long_rte(uchar);
long __ovld __cnfn convert_long_sat_rte(uchar);
long __ovld __cnfn convert_long_rtz(uchar);
long __ovld __cnfn convert_long_sat_rtz(uchar);
long __ovld __cnfn convert_long_rtp(uchar);
long __ovld __cnfn convert_long_sat_rtp(uchar);
long __ovld __cnfn convert_long_rtn(uchar);
long __ovld __cnfn convert_long_sat_rtn(uchar);
long __ovld __cnfn convert_long(uchar);
long __ovld __cnfn convert_long_sat(uchar);
long __ovld __cnfn convert_long_rte(short);
long __ovld __cnfn convert_long_sat_rte(short);
long __ovld __cnfn convert_long_rtz(short);
long __ovld __cnfn convert_long_sat_rtz(short);
long __ovld __cnfn convert_long_rtp(short);
long __ovld __cnfn convert_long_sat_rtp(short);
long __ovld __cnfn convert_long_rtn(short);
long __ovld __cnfn convert_long_sat_rtn(short);
long __ovld __cnfn convert_long(short);
long __ovld __cnfn convert_long_sat(short);
long __ovld __cnfn convert_long_rte(ushort);
long __ovld __cnfn convert_long_sat_rte(ushort);
long __ovld __cnfn convert_long_rtz(ushort);
long __ovld __cnfn convert_long_sat_rtz(ushort);
long __ovld __cnfn convert_long_rtp(ushort);
long __ovld __cnfn convert_long_sat_rtp(ushort);
long __ovld __cnfn convert_long_rtn(ushort);
long __ovld __cnfn convert_long_sat_rtn(ushort);
long __ovld __cnfn convert_long(ushort);
long __ovld __cnfn convert_long_sat(ushort);
long __ovld __cnfn convert_long_rte(int);
long __ovld __cnfn convert_long_sat_rte(int);
long __ovld __cnfn convert_long_rtz(int);
long __ovld __cnfn convert_long_sat_rtz(int);
long __ovld __cnfn convert_long_rtp(int);
long __ovld __cnfn convert_long_sat_rtp(int);
long __ovld __cnfn convert_long_rtn(int);
long __ovld __cnfn convert_long_sat_rtn(int);
long __ovld __cnfn convert_long(int);
long __ovld __cnfn convert_long_sat(int);
long __ovld __cnfn convert_long_rte(uint);
long __ovld __cnfn convert_long_sat_rte(uint);
long __ovld __cnfn convert_long_rtz(uint);
long __ovld __cnfn convert_long_sat_rtz(uint);
long __ovld __cnfn convert_long_rtp(uint);
long __ovld __cnfn convert_long_sat_rtp(uint);
long __ovld __cnfn convert_long_rtn(uint);
long __ovld __cnfn convert_long_sat_rtn(uint);
long __ovld __cnfn convert_long(uint);
long __ovld __cnfn convert_long_sat(uint);
long __ovld __cnfn convert_long_rte(long);
long __ovld __cnfn convert_long_sat_rte(long);
long __ovld __cnfn convert_long_rtz(long);
long __ovld __cnfn convert_long_sat_rtz(long);
long __ovld __cnfn convert_long_rtp(long);
long __ovld __cnfn convert_long_sat_rtp(long);
long __ovld __cnfn convert_long_rtn(long);
long __ovld __cnfn convert_long_sat_rtn(long);
long __ovld __cnfn convert_long(long);
long __ovld __cnfn convert_long_sat(long);
long __ovld __cnfn convert_long_rte(ulong);
long __ovld __cnfn convert_long_sat_rte(ulong);
long __ovld __cnfn convert_long_rtz(ulong);
long __ovld __cnfn convert_long_sat_rtz(ulong);
long __ovld __cnfn convert_long_rtp(ulong);
long __ovld __cnfn convert_long_sat_rtp(ulong);
long __ovld __cnfn convert_long_rtn(ulong);
long __ovld __cnfn convert_long_sat_rtn(ulong);
long __ovld __cnfn convert_long(ulong);
long __ovld __cnfn convert_long_sat(ulong);
long __ovld __cnfn convert_long_rte(float);
long __ovld __cnfn convert_long_sat_rte(float);
long __ovld __cnfn convert_long_rtz(float);
long __ovld __cnfn convert_long_sat_rtz(float);
long __ovld __cnfn convert_long_rtp(float);
long __ovld __cnfn convert_long_sat_rtp(float);
long __ovld __cnfn convert_long_rtn(float);
long __ovld __cnfn convert_long_sat_rtn(float);
long __ovld __cnfn convert_long(float);
long __ovld __cnfn convert_long_sat(float);
ulong __ovld __cnfn convert_ulong_rte(char);
ulong __ovld __cnfn convert_ulong_sat_rte(char);
ulong __ovld __cnfn convert_ulong_rtz(char);
ulong __ovld __cnfn convert_ulong_sat_rtz(char);
ulong __ovld __cnfn convert_ulong_rtp(char);
ulong __ovld __cnfn convert_ulong_sat_rtp(char);
ulong __ovld __cnfn convert_ulong_rtn(char);
ulong __ovld __cnfn convert_ulong_sat_rtn(char);
ulong __ovld __cnfn convert_ulong(char);
ulong __ovld __cnfn convert_ulong_sat(char);
ulong __ovld __cnfn convert_ulong_rte(uchar);
ulong __ovld __cnfn convert_ulong_sat_rte(uchar);
ulong __ovld __cnfn convert_ulong_rtz(uchar);
ulong __ovld __cnfn convert_ulong_sat_rtz(uchar);
ulong __ovld __cnfn convert_ulong_rtp(uchar);
ulong __ovld __cnfn convert_ulong_sat_rtp(uchar);
ulong __ovld __cnfn convert_ulong_rtn(uchar);
ulong __ovld __cnfn convert_ulong_sat_rtn(uchar);
ulong __ovld __cnfn convert_ulong(uchar);
ulong __ovld __cnfn convert_ulong_sat(uchar);
ulong __ovld __cnfn convert_ulong_rte(short);
ulong __ovld __cnfn convert_ulong_sat_rte(short);
ulong __ovld __cnfn convert_ulong_rtz(short);
ulong __ovld __cnfn convert_ulong_sat_rtz(short);
ulong __ovld __cnfn convert_ulong_rtp(short);
ulong __ovld __cnfn convert_ulong_sat_rtp(short);
ulong __ovld __cnfn convert_ulong_rtn(short);
ulong __ovld __cnfn convert_ulong_sat_rtn(short);
ulong __ovld __cnfn convert_ulong(short);
ulong __ovld __cnfn convert_ulong_sat(short);
ulong __ovld __cnfn convert_ulong_rte(ushort);
ulong __ovld __cnfn convert_ulong_sat_rte(ushort);
ulong __ovld __cnfn convert_ulong_rtz(ushort);
ulong __ovld __cnfn convert_ulong_sat_rtz(ushort);
ulong __ovld __cnfn convert_ulong_rtp(ushort);
ulong __ovld __cnfn convert_ulong_sat_rtp(ushort);
ulong __ovld __cnfn convert_ulong_rtn(ushort);
ulong __ovld __cnfn convert_ulong_sat_rtn(ushort);
ulong __ovld __cnfn convert_ulong(ushort);
ulong __ovld __cnfn convert_ulong_sat(ushort);
ulong __ovld __cnfn convert_ulong_rte(int);
ulong __ovld __cnfn convert_ulong_sat_rte(int);
ulong __ovld __cnfn convert_ulong_rtz(int);
ulong __ovld __cnfn convert_ulong_sat_rtz(int);
ulong __ovld __cnfn convert_ulong_rtp(int);
ulong __ovld __cnfn convert_ulong_sat_rtp(int);
ulong __ovld __cnfn convert_ulong_rtn(int);
ulong __ovld __cnfn convert_ulong_sat_rtn(int);
ulong __ovld __cnfn convert_ulong(int);
ulong __ovld __cnfn convert_ulong_sat(int);
ulong __ovld __cnfn convert_ulong_rte(uint);
ulong __ovld __cnfn convert_ulong_sat_rte(uint);
ulong __ovld __cnfn convert_ulong_rtz(uint);
ulong __ovld __cnfn convert_ulong_sat_rtz(uint);
ulong __ovld __cnfn convert_ulong_rtp(uint);
ulong __ovld __cnfn convert_ulong_sat_rtp(uint);
ulong __ovld __cnfn convert_ulong_rtn(uint);
ulong __ovld __cnfn convert_ulong_sat_rtn(uint);
ulong __ovld __cnfn convert_ulong(uint);
ulong __ovld __cnfn convert_ulong_sat(uint);
ulong __ovld __cnfn convert_ulong_rte(long);
ulong __ovld __cnfn convert_ulong_sat_rte(long);
ulong __ovld __cnfn convert_ulong_rtz(long);
ulong __ovld __cnfn convert_ulong_sat_rtz(long);
ulong __ovld __cnfn convert_ulong_rtp(long);
ulong __ovld __cnfn convert_ulong_sat_rtp(long);
ulong __ovld __cnfn convert_ulong_rtn(long);
ulong __ovld __cnfn convert_ulong_sat_rtn(long);
ulong __ovld __cnfn convert_ulong(long);
ulong __ovld __cnfn convert_ulong_sat(long);
ulong __ovld __cnfn convert_ulong_rte(ulong);
ulong __ovld __cnfn convert_ulong_sat_rte(ulong);
ulong __ovld __cnfn convert_ulong_rtz(ulong);
ulong __ovld __cnfn convert_ulong_sat_rtz(ulong);
ulong __ovld __cnfn convert_ulong_rtp(ulong);
ulong __ovld __cnfn convert_ulong_sat_rtp(ulong);
ulong __ovld __cnfn convert_ulong_rtn(ulong);
ulong __ovld __cnfn convert_ulong_sat_rtn(ulong);
ulong __ovld __cnfn convert_ulong(ulong);
ulong __ovld __cnfn convert_ulong_sat(ulong);
ulong __ovld __cnfn convert_ulong_rte(float);
ulong __ovld __cnfn convert_ulong_sat_rte(float);
ulong __ovld __cnfn convert_ulong_rtz(float);
ulong __ovld __cnfn convert_ulong_sat_rtz(float);
ulong __ovld __cnfn convert_ulong_rtp(float);
ulong __ovld __cnfn convert_ulong_sat_rtp(float);
ulong __ovld __cnfn convert_ulong_rtn(float);
ulong __ovld __cnfn convert_ulong_sat_rtn(float);
ulong __ovld __cnfn convert_ulong(float);
ulong __ovld __cnfn convert_ulong_sat(float);
float __ovld __cnfn convert_float_rte(char);
float __ovld __cnfn convert_float_rtz(char);
float __ovld __cnfn convert_float_rtp(char);
float __ovld __cnfn convert_float_rtn(char);
float __ovld __cnfn convert_float(char);
float __ovld __cnfn convert_float_rte(uchar);
float __ovld __cnfn convert_float_rtz(uchar);
float __ovld __cnfn convert_float_rtp(uchar);
float __ovld __cnfn convert_float_rtn(uchar);
float __ovld __cnfn convert_float(uchar);
float __ovld __cnfn convert_float_rte(short);
float __ovld __cnfn convert_float_rtz(short);
float __ovld __cnfn convert_float_rtp(short);
float __ovld __cnfn convert_float_rtn(short);
float __ovld __cnfn convert_float(short);
float __ovld __cnfn convert_float_rte(ushort);
float __ovld __cnfn convert_float_rtz(ushort);
float __ovld __cnfn convert_float_rtp(ushort);
float __ovld __cnfn convert_float_rtn(ushort);
float __ovld __cnfn convert_float(ushort);
float __ovld __cnfn convert_float_rte(int);
float __ovld __cnfn convert_float_rtz(int);
float __ovld __cnfn convert_float_rtp(int);
float __ovld __cnfn convert_float_rtn(int);
float __ovld __cnfn convert_float(int);
float __ovld __cnfn convert_float_rte(uint);
float __ovld __cnfn convert_float_rtz(uint);
float __ovld __cnfn convert_float_rtp(uint);
float __ovld __cnfn convert_float_rtn(uint);
float __ovld __cnfn convert_float(uint);
float __ovld __cnfn convert_float_rte(long);
float __ovld __cnfn convert_float_rtz(long);
float __ovld __cnfn convert_float_rtp(long);
float __ovld __cnfn convert_float_rtn(long);
float __ovld __cnfn convert_float(long);
float __ovld __cnfn convert_float_rte(ulong);
float __ovld __cnfn convert_float_rtz(ulong);
float __ovld __cnfn convert_float_rtp(ulong);
float __ovld __cnfn convert_float_rtn(ulong);
float __ovld __cnfn convert_float(ulong);
float __ovld __cnfn convert_float_rte(float);
float __ovld __cnfn convert_float_rtz(float);
float __ovld __cnfn convert_float_rtp(float);
float __ovld __cnfn convert_float_rtn(float);
float __ovld __cnfn convert_float(float);
char2 __ovld __cnfn convert_char2_rte(char2);
char2 __ovld __cnfn convert_char2_sat_rte(char2);
char2 __ovld __cnfn convert_char2_rtz(char2);
char2 __ovld __cnfn convert_char2_sat_rtz(char2);
char2 __ovld __cnfn convert_char2_rtp(char2);
char2 __ovld __cnfn convert_char2_sat_rtp(char2);
char2 __ovld __cnfn convert_char2_rtn(char2);
char2 __ovld __cnfn convert_char2_sat_rtn(char2);
char2 __ovld __cnfn convert_char2(char2);
char2 __ovld __cnfn convert_char2_sat(char2);
char2 __ovld __cnfn convert_char2_rte(uchar2);
char2 __ovld __cnfn convert_char2_sat_rte(uchar2);
char2 __ovld __cnfn convert_char2_rtz(uchar2);
char2 __ovld __cnfn convert_char2_sat_rtz(uchar2);
char2 __ovld __cnfn convert_char2_rtp(uchar2);
char2 __ovld __cnfn convert_char2_sat_rtp(uchar2);
char2 __ovld __cnfn convert_char2_rtn(uchar2);
char2 __ovld __cnfn convert_char2_sat_rtn(uchar2);
char2 __ovld __cnfn convert_char2(uchar2);
char2 __ovld __cnfn convert_char2_sat(uchar2);
char2 __ovld __cnfn convert_char2_rte(short2);
char2 __ovld __cnfn convert_char2_sat_rte(short2);
char2 __ovld __cnfn convert_char2_rtz(short2);
char2 __ovld __cnfn convert_char2_sat_rtz(short2);
char2 __ovld __cnfn convert_char2_rtp(short2);
char2 __ovld __cnfn convert_char2_sat_rtp(short2);
char2 __ovld __cnfn convert_char2_rtn(short2);
char2 __ovld __cnfn convert_char2_sat_rtn(short2);
char2 __ovld __cnfn convert_char2(short2);
char2 __ovld __cnfn convert_char2_sat(short2);
char2 __ovld __cnfn convert_char2_rte(ushort2);
char2 __ovld __cnfn convert_char2_sat_rte(ushort2);
char2 __ovld __cnfn convert_char2_rtz(ushort2);
char2 __ovld __cnfn convert_char2_sat_rtz(ushort2);
char2 __ovld __cnfn convert_char2_rtp(ushort2);
char2 __ovld __cnfn convert_char2_sat_rtp(ushort2);
char2 __ovld __cnfn convert_char2_rtn(ushort2);
char2 __ovld __cnfn convert_char2_sat_rtn(ushort2);
char2 __ovld __cnfn convert_char2(ushort2);
char2 __ovld __cnfn convert_char2_sat(ushort2);
char2 __ovld __cnfn convert_char2_rte(int2);
char2 __ovld __cnfn convert_char2_sat_rte(int2);
char2 __ovld __cnfn convert_char2_rtz(int2);
char2 __ovld __cnfn convert_char2_sat_rtz(int2);
char2 __ovld __cnfn convert_char2_rtp(int2);
char2 __ovld __cnfn convert_char2_sat_rtp(int2);
char2 __ovld __cnfn convert_char2_rtn(int2);
char2 __ovld __cnfn convert_char2_sat_rtn(int2);
char2 __ovld __cnfn convert_char2(int2);
char2 __ovld __cnfn convert_char2_sat(int2);
char2 __ovld __cnfn convert_char2_rte(uint2);
char2 __ovld __cnfn convert_char2_sat_rte(uint2);
char2 __ovld __cnfn convert_char2_rtz(uint2);
char2 __ovld __cnfn convert_char2_sat_rtz(uint2);
char2 __ovld __cnfn convert_char2_rtp(uint2);
char2 __ovld __cnfn convert_char2_sat_rtp(uint2);
char2 __ovld __cnfn convert_char2_rtn(uint2);
char2 __ovld __cnfn convert_char2_sat_rtn(uint2);
char2 __ovld __cnfn convert_char2(uint2);
char2 __ovld __cnfn convert_char2_sat(uint2);
char2 __ovld __cnfn convert_char2_rte(long2);
char2 __ovld __cnfn convert_char2_sat_rte(long2);
char2 __ovld __cnfn convert_char2_rtz(long2);
char2 __ovld __cnfn convert_char2_sat_rtz(long2);
char2 __ovld __cnfn convert_char2_rtp(long2);
char2 __ovld __cnfn convert_char2_sat_rtp(long2);
char2 __ovld __cnfn convert_char2_rtn(long2);
char2 __ovld __cnfn convert_char2_sat_rtn(long2);
char2 __ovld __cnfn convert_char2(long2);
char2 __ovld __cnfn convert_char2_sat(long2);
char2 __ovld __cnfn convert_char2_rte(ulong2);
char2 __ovld __cnfn convert_char2_sat_rte(ulong2);
char2 __ovld __cnfn convert_char2_rtz(ulong2);
char2 __ovld __cnfn convert_char2_sat_rtz(ulong2);
char2 __ovld __cnfn convert_char2_rtp(ulong2);
char2 __ovld __cnfn convert_char2_sat_rtp(ulong2);
char2 __ovld __cnfn convert_char2_rtn(ulong2);
char2 __ovld __cnfn convert_char2_sat_rtn(ulong2);
char2 __ovld __cnfn convert_char2(ulong2);
char2 __ovld __cnfn convert_char2_sat(ulong2);
char2 __ovld __cnfn convert_char2_rte(float2);
char2 __ovld __cnfn convert_char2_sat_rte(float2);
char2 __ovld __cnfn convert_char2_rtz(float2);
char2 __ovld __cnfn convert_char2_sat_rtz(float2);
char2 __ovld __cnfn convert_char2_rtp(float2);
char2 __ovld __cnfn convert_char2_sat_rtp(float2);
char2 __ovld __cnfn convert_char2_rtn(float2);
char2 __ovld __cnfn convert_char2_sat_rtn(float2);
char2 __ovld __cnfn convert_char2(float2);
char2 __ovld __cnfn convert_char2_sat(float2);
uchar2 __ovld __cnfn convert_uchar2_rte(char2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(char2);
uchar2 __ovld __cnfn convert_uchar2_rtz(char2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(char2);
uchar2 __ovld __cnfn convert_uchar2_rtp(char2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(char2);
uchar2 __ovld __cnfn convert_uchar2_rtn(char2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(char2);
uchar2 __ovld __cnfn convert_uchar2(char2);
uchar2 __ovld __cnfn convert_uchar2_sat(char2);
uchar2 __ovld __cnfn convert_uchar2_rte(uchar2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(uchar2);
uchar2 __ovld __cnfn convert_uchar2_rtz(uchar2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(uchar2);
uchar2 __ovld __cnfn convert_uchar2_rtp(uchar2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(uchar2);
uchar2 __ovld __cnfn convert_uchar2_rtn(uchar2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(uchar2);
uchar2 __ovld __cnfn convert_uchar2(uchar2);
uchar2 __ovld __cnfn convert_uchar2_sat(uchar2);
uchar2 __ovld __cnfn convert_uchar2_rte(short2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(short2);
uchar2 __ovld __cnfn convert_uchar2_rtz(short2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(short2);
uchar2 __ovld __cnfn convert_uchar2_rtp(short2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(short2);
uchar2 __ovld __cnfn convert_uchar2_rtn(short2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(short2);
uchar2 __ovld __cnfn convert_uchar2(short2);
uchar2 __ovld __cnfn convert_uchar2_sat(short2);
uchar2 __ovld __cnfn convert_uchar2_rte(ushort2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(ushort2);
uchar2 __ovld __cnfn convert_uchar2_rtz(ushort2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(ushort2);
uchar2 __ovld __cnfn convert_uchar2_rtp(ushort2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(ushort2);
uchar2 __ovld __cnfn convert_uchar2_rtn(ushort2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(ushort2);
uchar2 __ovld __cnfn convert_uchar2(ushort2);
uchar2 __ovld __cnfn convert_uchar2_sat(ushort2);
uchar2 __ovld __cnfn convert_uchar2_rte(int2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(int2);
uchar2 __ovld __cnfn convert_uchar2_rtz(int2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(int2);
uchar2 __ovld __cnfn convert_uchar2_rtp(int2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(int2);
uchar2 __ovld __cnfn convert_uchar2_rtn(int2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(int2);
uchar2 __ovld __cnfn convert_uchar2(int2);
uchar2 __ovld __cnfn convert_uchar2_sat(int2);
uchar2 __ovld __cnfn convert_uchar2_rte(uint2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(uint2);
uchar2 __ovld __cnfn convert_uchar2_rtz(uint2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(uint2);
uchar2 __ovld __cnfn convert_uchar2_rtp(uint2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(uint2);
uchar2 __ovld __cnfn convert_uchar2_rtn(uint2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(uint2);
uchar2 __ovld __cnfn convert_uchar2(uint2);
uchar2 __ovld __cnfn convert_uchar2_sat(uint2);
uchar2 __ovld __cnfn convert_uchar2_rte(long2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(long2);
uchar2 __ovld __cnfn convert_uchar2_rtz(long2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(long2);
uchar2 __ovld __cnfn convert_uchar2_rtp(long2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(long2);
uchar2 __ovld __cnfn convert_uchar2_rtn(long2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(long2);
uchar2 __ovld __cnfn convert_uchar2(long2);
uchar2 __ovld __cnfn convert_uchar2_sat(long2);
uchar2 __ovld __cnfn convert_uchar2_rte(ulong2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(ulong2);
uchar2 __ovld __cnfn convert_uchar2_rtz(ulong2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(ulong2);
uchar2 __ovld __cnfn convert_uchar2_rtp(ulong2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(ulong2);
uchar2 __ovld __cnfn convert_uchar2_rtn(ulong2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(ulong2);
uchar2 __ovld __cnfn convert_uchar2(ulong2);
uchar2 __ovld __cnfn convert_uchar2_sat(ulong2);
uchar2 __ovld __cnfn convert_uchar2_rte(float2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(float2);
uchar2 __ovld __cnfn convert_uchar2_rtz(float2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(float2);
uchar2 __ovld __cnfn convert_uchar2_rtp(float2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(float2);
uchar2 __ovld __cnfn convert_uchar2_rtn(float2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(float2);
uchar2 __ovld __cnfn convert_uchar2(float2);
uchar2 __ovld __cnfn convert_uchar2_sat(float2);
short2 __ovld __cnfn convert_short2_rte(char2);
short2 __ovld __cnfn convert_short2_sat_rte(char2);
short2 __ovld __cnfn convert_short2_rtz(char2);
short2 __ovld __cnfn convert_short2_sat_rtz(char2);
short2 __ovld __cnfn convert_short2_rtp(char2);
short2 __ovld __cnfn convert_short2_sat_rtp(char2);
short2 __ovld __cnfn convert_short2_rtn(char2);
short2 __ovld __cnfn convert_short2_sat_rtn(char2);
short2 __ovld __cnfn convert_short2(char2);
short2 __ovld __cnfn convert_short2_sat(char2);
short2 __ovld __cnfn convert_short2_rte(uchar2);
short2 __ovld __cnfn convert_short2_sat_rte(uchar2);
short2 __ovld __cnfn convert_short2_rtz(uchar2);
short2 __ovld __cnfn convert_short2_sat_rtz(uchar2);
short2 __ovld __cnfn convert_short2_rtp(uchar2);
short2 __ovld __cnfn convert_short2_sat_rtp(uchar2);
short2 __ovld __cnfn convert_short2_rtn(uchar2);
short2 __ovld __cnfn convert_short2_sat_rtn(uchar2);
short2 __ovld __cnfn convert_short2(uchar2);
short2 __ovld __cnfn convert_short2_sat(uchar2);
short2 __ovld __cnfn convert_short2_rte(short2);
short2 __ovld __cnfn convert_short2_sat_rte(short2);
short2 __ovld __cnfn convert_short2_rtz(short2);
short2 __ovld __cnfn convert_short2_sat_rtz(short2);
short2 __ovld __cnfn convert_short2_rtp(short2);
short2 __ovld __cnfn convert_short2_sat_rtp(short2);
short2 __ovld __cnfn convert_short2_rtn(short2);
short2 __ovld __cnfn convert_short2_sat_rtn(short2);
short2 __ovld __cnfn convert_short2(short2);
short2 __ovld __cnfn convert_short2_sat(short2);
short2 __ovld __cnfn convert_short2_rte(ushort2);
short2 __ovld __cnfn convert_short2_sat_rte(ushort2);
short2 __ovld __cnfn convert_short2_rtz(ushort2);
short2 __ovld __cnfn convert_short2_sat_rtz(ushort2);
short2 __ovld __cnfn convert_short2_rtp(ushort2);
short2 __ovld __cnfn convert_short2_sat_rtp(ushort2);
short2 __ovld __cnfn convert_short2_rtn(ushort2);
short2 __ovld __cnfn convert_short2_sat_rtn(ushort2);
short2 __ovld __cnfn convert_short2(ushort2);
short2 __ovld __cnfn convert_short2_sat(ushort2);
short2 __ovld __cnfn convert_short2_rte(int2);
short2 __ovld __cnfn convert_short2_sat_rte(int2);
short2 __ovld __cnfn convert_short2_rtz(int2);
short2 __ovld __cnfn convert_short2_sat_rtz(int2);
short2 __ovld __cnfn convert_short2_rtp(int2);
short2 __ovld __cnfn convert_short2_sat_rtp(int2);
short2 __ovld __cnfn convert_short2_rtn(int2);
short2 __ovld __cnfn convert_short2_sat_rtn(int2);
short2 __ovld __cnfn convert_short2(int2);
short2 __ovld __cnfn convert_short2_sat(int2);
short2 __ovld __cnfn convert_short2_rte(uint2);
short2 __ovld __cnfn convert_short2_sat_rte(uint2);
short2 __ovld __cnfn convert_short2_rtz(uint2);
short2 __ovld __cnfn convert_short2_sat_rtz(uint2);
short2 __ovld __cnfn convert_short2_rtp(uint2);
short2 __ovld __cnfn convert_short2_sat_rtp(uint2);
short2 __ovld __cnfn convert_short2_rtn(uint2);
short2 __ovld __cnfn convert_short2_sat_rtn(uint2);
short2 __ovld __cnfn convert_short2(uint2);
short2 __ovld __cnfn convert_short2_sat(uint2);
short2 __ovld __cnfn convert_short2_rte(long2);
short2 __ovld __cnfn convert_short2_sat_rte(long2);
short2 __ovld __cnfn convert_short2_rtz(long2);
short2 __ovld __cnfn convert_short2_sat_rtz(long2);
short2 __ovld __cnfn convert_short2_rtp(long2);
short2 __ovld __cnfn convert_short2_sat_rtp(long2);
short2 __ovld __cnfn convert_short2_rtn(long2);
short2 __ovld __cnfn convert_short2_sat_rtn(long2);
short2 __ovld __cnfn convert_short2(long2);
short2 __ovld __cnfn convert_short2_sat(long2);
short2 __ovld __cnfn convert_short2_rte(ulong2);
short2 __ovld __cnfn convert_short2_sat_rte(ulong2);
short2 __ovld __cnfn convert_short2_rtz(ulong2);
short2 __ovld __cnfn convert_short2_sat_rtz(ulong2);
short2 __ovld __cnfn convert_short2_rtp(ulong2);
short2 __ovld __cnfn convert_short2_sat_rtp(ulong2);
short2 __ovld __cnfn convert_short2_rtn(ulong2);
short2 __ovld __cnfn convert_short2_sat_rtn(ulong2);
short2 __ovld __cnfn convert_short2(ulong2);
short2 __ovld __cnfn convert_short2_sat(ulong2);
short2 __ovld __cnfn convert_short2_rte(float2);
short2 __ovld __cnfn convert_short2_sat_rte(float2);
short2 __ovld __cnfn convert_short2_rtz(float2);
short2 __ovld __cnfn convert_short2_sat_rtz(float2);
short2 __ovld __cnfn convert_short2_rtp(float2);
short2 __ovld __cnfn convert_short2_sat_rtp(float2);
short2 __ovld __cnfn convert_short2_rtn(float2);
short2 __ovld __cnfn convert_short2_sat_rtn(float2);
short2 __ovld __cnfn convert_short2(float2);
short2 __ovld __cnfn convert_short2_sat(float2);
ushort2 __ovld __cnfn convert_ushort2_rte(char2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(char2);
ushort2 __ovld __cnfn convert_ushort2_rtz(char2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(char2);
ushort2 __ovld __cnfn convert_ushort2_rtp(char2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(char2);
ushort2 __ovld __cnfn convert_ushort2_rtn(char2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(char2);
ushort2 __ovld __cnfn convert_ushort2(char2);
ushort2 __ovld __cnfn convert_ushort2_sat(char2);
ushort2 __ovld __cnfn convert_ushort2_rte(uchar2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(uchar2);
ushort2 __ovld __cnfn convert_ushort2_rtz(uchar2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(uchar2);
ushort2 __ovld __cnfn convert_ushort2_rtp(uchar2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(uchar2);
ushort2 __ovld __cnfn convert_ushort2_rtn(uchar2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(uchar2);
ushort2 __ovld __cnfn convert_ushort2(uchar2);
ushort2 __ovld __cnfn convert_ushort2_sat(uchar2);
ushort2 __ovld __cnfn convert_ushort2_rte(short2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(short2);
ushort2 __ovld __cnfn convert_ushort2_rtz(short2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(short2);
ushort2 __ovld __cnfn convert_ushort2_rtp(short2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(short2);
ushort2 __ovld __cnfn convert_ushort2_rtn(short2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(short2);
ushort2 __ovld __cnfn convert_ushort2(short2);
ushort2 __ovld __cnfn convert_ushort2_sat(short2);
ushort2 __ovld __cnfn convert_ushort2_rte(ushort2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(ushort2);
ushort2 __ovld __cnfn convert_ushort2_rtz(ushort2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(ushort2);
ushort2 __ovld __cnfn convert_ushort2_rtp(ushort2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(ushort2);
ushort2 __ovld __cnfn convert_ushort2_rtn(ushort2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(ushort2);
ushort2 __ovld __cnfn convert_ushort2(ushort2);
ushort2 __ovld __cnfn convert_ushort2_sat(ushort2);
ushort2 __ovld __cnfn convert_ushort2_rte(int2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(int2);
ushort2 __ovld __cnfn convert_ushort2_rtz(int2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(int2);
ushort2 __ovld __cnfn convert_ushort2_rtp(int2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(int2);
ushort2 __ovld __cnfn convert_ushort2_rtn(int2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(int2);
ushort2 __ovld __cnfn convert_ushort2(int2);
ushort2 __ovld __cnfn convert_ushort2_sat(int2);
ushort2 __ovld __cnfn convert_ushort2_rte(uint2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(uint2);
ushort2 __ovld __cnfn convert_ushort2_rtz(uint2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(uint2);
ushort2 __ovld __cnfn convert_ushort2_rtp(uint2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(uint2);
ushort2 __ovld __cnfn convert_ushort2_rtn(uint2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(uint2);
ushort2 __ovld __cnfn convert_ushort2(uint2);
ushort2 __ovld __cnfn convert_ushort2_sat(uint2);
ushort2 __ovld __cnfn convert_ushort2_rte(long2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(long2);
ushort2 __ovld __cnfn convert_ushort2_rtz(long2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(long2);
ushort2 __ovld __cnfn convert_ushort2_rtp(long2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(long2);
ushort2 __ovld __cnfn convert_ushort2_rtn(long2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(long2);
ushort2 __ovld __cnfn convert_ushort2(long2);
ushort2 __ovld __cnfn convert_ushort2_sat(long2);
ushort2 __ovld __cnfn convert_ushort2_rte(ulong2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(ulong2);
ushort2 __ovld __cnfn convert_ushort2_rtz(ulong2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(ulong2);
ushort2 __ovld __cnfn convert_ushort2_rtp(ulong2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(ulong2);
ushort2 __ovld __cnfn convert_ushort2_rtn(ulong2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(ulong2);
ushort2 __ovld __cnfn convert_ushort2(ulong2);
ushort2 __ovld __cnfn convert_ushort2_sat(ulong2);
ushort2 __ovld __cnfn convert_ushort2_rte(float2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(float2);
ushort2 __ovld __cnfn convert_ushort2_rtz(float2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(float2);
ushort2 __ovld __cnfn convert_ushort2_rtp(float2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(float2);
ushort2 __ovld __cnfn convert_ushort2_rtn(float2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(float2);
ushort2 __ovld __cnfn convert_ushort2(float2);
ushort2 __ovld __cnfn convert_ushort2_sat(float2);
int2 __ovld __cnfn convert_int2_rte(char2);
int2 __ovld __cnfn convert_int2_sat_rte(char2);
int2 __ovld __cnfn convert_int2_rtz(char2);
int2 __ovld __cnfn convert_int2_sat_rtz(char2);
int2 __ovld __cnfn convert_int2_rtp(char2);
int2 __ovld __cnfn convert_int2_sat_rtp(char2);
int2 __ovld __cnfn convert_int2_rtn(char2);
int2 __ovld __cnfn convert_int2_sat_rtn(char2);
int2 __ovld __cnfn convert_int2(char2);
int2 __ovld __cnfn convert_int2_sat(char2);
int2 __ovld __cnfn convert_int2_rte(uchar2);
int2 __ovld __cnfn convert_int2_sat_rte(uchar2);
int2 __ovld __cnfn convert_int2_rtz(uchar2);
int2 __ovld __cnfn convert_int2_sat_rtz(uchar2);
int2 __ovld __cnfn convert_int2_rtp(uchar2);
int2 __ovld __cnfn convert_int2_sat_rtp(uchar2);
int2 __ovld __cnfn convert_int2_rtn(uchar2);
int2 __ovld __cnfn convert_int2_sat_rtn(uchar2);
int2 __ovld __cnfn convert_int2(uchar2);
int2 __ovld __cnfn convert_int2_sat(uchar2);
int2 __ovld __cnfn convert_int2_rte(short2);
int2 __ovld __cnfn convert_int2_sat_rte(short2);
int2 __ovld __cnfn convert_int2_rtz(short2);
int2 __ovld __cnfn convert_int2_sat_rtz(short2);
int2 __ovld __cnfn convert_int2_rtp(short2);
int2 __ovld __cnfn convert_int2_sat_rtp(short2);
int2 __ovld __cnfn convert_int2_rtn(short2);
int2 __ovld __cnfn convert_int2_sat_rtn(short2);
int2 __ovld __cnfn convert_int2(short2);
int2 __ovld __cnfn convert_int2_sat(short2);
int2 __ovld __cnfn convert_int2_rte(ushort2);
int2 __ovld __cnfn convert_int2_sat_rte(ushort2);
int2 __ovld __cnfn convert_int2_rtz(ushort2);
int2 __ovld __cnfn convert_int2_sat_rtz(ushort2);
int2 __ovld __cnfn convert_int2_rtp(ushort2);
int2 __ovld __cnfn convert_int2_sat_rtp(ushort2);
int2 __ovld __cnfn convert_int2_rtn(ushort2);
int2 __ovld __cnfn convert_int2_sat_rtn(ushort2);
int2 __ovld __cnfn convert_int2(ushort2);
int2 __ovld __cnfn convert_int2_sat(ushort2);
int2 __ovld __cnfn convert_int2_rte(int2);
int2 __ovld __cnfn convert_int2_sat_rte(int2);
int2 __ovld __cnfn convert_int2_rtz(int2);
int2 __ovld __cnfn convert_int2_sat_rtz(int2);
int2 __ovld __cnfn convert_int2_rtp(int2);
int2 __ovld __cnfn convert_int2_sat_rtp(int2);
int2 __ovld __cnfn convert_int2_rtn(int2);
int2 __ovld __cnfn convert_int2_sat_rtn(int2);
int2 __ovld __cnfn convert_int2(int2);
int2 __ovld __cnfn convert_int2_sat(int2);
int2 __ovld __cnfn convert_int2_rte(uint2);
int2 __ovld __cnfn convert_int2_sat_rte(uint2);
int2 __ovld __cnfn convert_int2_rtz(uint2);
int2 __ovld __cnfn convert_int2_sat_rtz(uint2);
int2 __ovld __cnfn convert_int2_rtp(uint2);
int2 __ovld __cnfn convert_int2_sat_rtp(uint2);
int2 __ovld __cnfn convert_int2_rtn(uint2);
int2 __ovld __cnfn convert_int2_sat_rtn(uint2);
int2 __ovld __cnfn convert_int2(uint2);
int2 __ovld __cnfn convert_int2_sat(uint2);
int2 __ovld __cnfn convert_int2_rte(long2);
int2 __ovld __cnfn convert_int2_sat_rte(long2);
int2 __ovld __cnfn convert_int2_rtz(long2);
int2 __ovld __cnfn convert_int2_sat_rtz(long2);
int2 __ovld __cnfn convert_int2_rtp(long2);
int2 __ovld __cnfn convert_int2_sat_rtp(long2);
int2 __ovld __cnfn convert_int2_rtn(long2);
int2 __ovld __cnfn convert_int2_sat_rtn(long2);
int2 __ovld __cnfn convert_int2(long2);
int2 __ovld __cnfn convert_int2_sat(long2);
int2 __ovld __cnfn convert_int2_rte(ulong2);
int2 __ovld __cnfn convert_int2_sat_rte(ulong2);
int2 __ovld __cnfn convert_int2_rtz(ulong2);
int2 __ovld __cnfn convert_int2_sat_rtz(ulong2);
int2 __ovld __cnfn convert_int2_rtp(ulong2);
int2 __ovld __cnfn convert_int2_sat_rtp(ulong2);
int2 __ovld __cnfn convert_int2_rtn(ulong2);
int2 __ovld __cnfn convert_int2_sat_rtn(ulong2);
int2 __ovld __cnfn convert_int2(ulong2);
int2 __ovld __cnfn convert_int2_sat(ulong2);
int2 __ovld __cnfn convert_int2_rte(float2);
int2 __ovld __cnfn convert_int2_sat_rte(float2);
int2 __ovld __cnfn convert_int2_rtz(float2);
int2 __ovld __cnfn convert_int2_sat_rtz(float2);
int2 __ovld __cnfn convert_int2_rtp(float2);
int2 __ovld __cnfn convert_int2_sat_rtp(float2);
int2 __ovld __cnfn convert_int2_rtn(float2);
int2 __ovld __cnfn convert_int2_sat_rtn(float2);
int2 __ovld __cnfn convert_int2(float2);
int2 __ovld __cnfn convert_int2_sat(float2);
uint2 __ovld __cnfn convert_uint2_rte(char2);
uint2 __ovld __cnfn convert_uint2_sat_rte(char2);
uint2 __ovld __cnfn convert_uint2_rtz(char2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(char2);
uint2 __ovld __cnfn convert_uint2_rtp(char2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(char2);
uint2 __ovld __cnfn convert_uint2_rtn(char2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(char2);
uint2 __ovld __cnfn convert_uint2(char2);
uint2 __ovld __cnfn convert_uint2_sat(char2);
uint2 __ovld __cnfn convert_uint2_rte(uchar2);
uint2 __ovld __cnfn convert_uint2_sat_rte(uchar2);
uint2 __ovld __cnfn convert_uint2_rtz(uchar2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(uchar2);
uint2 __ovld __cnfn convert_uint2_rtp(uchar2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(uchar2);
uint2 __ovld __cnfn convert_uint2_rtn(uchar2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(uchar2);
uint2 __ovld __cnfn convert_uint2(uchar2);
uint2 __ovld __cnfn convert_uint2_sat(uchar2);
uint2 __ovld __cnfn convert_uint2_rte(short2);
uint2 __ovld __cnfn convert_uint2_sat_rte(short2);
uint2 __ovld __cnfn convert_uint2_rtz(short2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(short2);
uint2 __ovld __cnfn convert_uint2_rtp(short2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(short2);
uint2 __ovld __cnfn convert_uint2_rtn(short2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(short2);
uint2 __ovld __cnfn convert_uint2(short2);
uint2 __ovld __cnfn convert_uint2_sat(short2);
uint2 __ovld __cnfn convert_uint2_rte(ushort2);
uint2 __ovld __cnfn convert_uint2_sat_rte(ushort2);
uint2 __ovld __cnfn convert_uint2_rtz(ushort2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(ushort2);
uint2 __ovld __cnfn convert_uint2_rtp(ushort2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(ushort2);
uint2 __ovld __cnfn convert_uint2_rtn(ushort2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(ushort2);
uint2 __ovld __cnfn convert_uint2(ushort2);
uint2 __ovld __cnfn convert_uint2_sat(ushort2);
uint2 __ovld __cnfn convert_uint2_rte(int2);
uint2 __ovld __cnfn convert_uint2_sat_rte(int2);
uint2 __ovld __cnfn convert_uint2_rtz(int2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(int2);
uint2 __ovld __cnfn convert_uint2_rtp(int2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(int2);
uint2 __ovld __cnfn convert_uint2_rtn(int2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(int2);
uint2 __ovld __cnfn convert_uint2(int2);
uint2 __ovld __cnfn convert_uint2_sat(int2);
uint2 __ovld __cnfn convert_uint2_rte(uint2);
uint2 __ovld __cnfn convert_uint2_sat_rte(uint2);
uint2 __ovld __cnfn convert_uint2_rtz(uint2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(uint2);
uint2 __ovld __cnfn convert_uint2_rtp(uint2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(uint2);
uint2 __ovld __cnfn convert_uint2_rtn(uint2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(uint2);
uint2 __ovld __cnfn convert_uint2(uint2);
uint2 __ovld __cnfn convert_uint2_sat(uint2);
uint2 __ovld __cnfn convert_uint2_rte(long2);
uint2 __ovld __cnfn convert_uint2_sat_rte(long2);
uint2 __ovld __cnfn convert_uint2_rtz(long2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(long2);
uint2 __ovld __cnfn convert_uint2_rtp(long2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(long2);
uint2 __ovld __cnfn convert_uint2_rtn(long2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(long2);
uint2 __ovld __cnfn convert_uint2(long2);
uint2 __ovld __cnfn convert_uint2_sat(long2);
uint2 __ovld __cnfn convert_uint2_rte(ulong2);
uint2 __ovld __cnfn convert_uint2_sat_rte(ulong2);
uint2 __ovld __cnfn convert_uint2_rtz(ulong2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(ulong2);
uint2 __ovld __cnfn convert_uint2_rtp(ulong2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(ulong2);
uint2 __ovld __cnfn convert_uint2_rtn(ulong2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(ulong2);
uint2 __ovld __cnfn convert_uint2(ulong2);
uint2 __ovld __cnfn convert_uint2_sat(ulong2);
uint2 __ovld __cnfn convert_uint2_rte(float2);
uint2 __ovld __cnfn convert_uint2_sat_rte(float2);
uint2 __ovld __cnfn convert_uint2_rtz(float2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(float2);
uint2 __ovld __cnfn convert_uint2_rtp(float2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(float2);
uint2 __ovld __cnfn convert_uint2_rtn(float2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(float2);
uint2 __ovld __cnfn convert_uint2(float2);
uint2 __ovld __cnfn convert_uint2_sat(float2);
long2 __ovld __cnfn convert_long2_rte(char2);
long2 __ovld __cnfn convert_long2_sat_rte(char2);
long2 __ovld __cnfn convert_long2_rtz(char2);
long2 __ovld __cnfn convert_long2_sat_rtz(char2);
long2 __ovld __cnfn convert_long2_rtp(char2);
long2 __ovld __cnfn convert_long2_sat_rtp(char2);
long2 __ovld __cnfn convert_long2_rtn(char2);
long2 __ovld __cnfn convert_long2_sat_rtn(char2);
long2 __ovld __cnfn convert_long2(char2);
long2 __ovld __cnfn convert_long2_sat(char2);
long2 __ovld __cnfn convert_long2_rte(uchar2);
long2 __ovld __cnfn convert_long2_sat_rte(uchar2);
long2 __ovld __cnfn convert_long2_rtz(uchar2);
long2 __ovld __cnfn convert_long2_sat_rtz(uchar2);
long2 __ovld __cnfn convert_long2_rtp(uchar2);
long2 __ovld __cnfn convert_long2_sat_rtp(uchar2);
long2 __ovld __cnfn convert_long2_rtn(uchar2);
long2 __ovld __cnfn convert_long2_sat_rtn(uchar2);
long2 __ovld __cnfn convert_long2(uchar2);
long2 __ovld __cnfn convert_long2_sat(uchar2);
long2 __ovld __cnfn convert_long2_rte(short2);
long2 __ovld __cnfn convert_long2_sat_rte(short2);
long2 __ovld __cnfn convert_long2_rtz(short2);
long2 __ovld __cnfn convert_long2_sat_rtz(short2);
long2 __ovld __cnfn convert_long2_rtp(short2);
long2 __ovld __cnfn convert_long2_sat_rtp(short2);
long2 __ovld __cnfn convert_long2_rtn(short2);
long2 __ovld __cnfn convert_long2_sat_rtn(short2);
long2 __ovld __cnfn convert_long2(short2);
long2 __ovld __cnfn convert_long2_sat(short2);
long2 __ovld __cnfn convert_long2_rte(ushort2);
long2 __ovld __cnfn convert_long2_sat_rte(ushort2);
long2 __ovld __cnfn convert_long2_rtz(ushort2);
long2 __ovld __cnfn convert_long2_sat_rtz(ushort2);
long2 __ovld __cnfn convert_long2_rtp(ushort2);
long2 __ovld __cnfn convert_long2_sat_rtp(ushort2);
long2 __ovld __cnfn convert_long2_rtn(ushort2);
long2 __ovld __cnfn convert_long2_sat_rtn(ushort2);
long2 __ovld __cnfn convert_long2(ushort2);
long2 __ovld __cnfn convert_long2_sat(ushort2);
long2 __ovld __cnfn convert_long2_rte(int2);
long2 __ovld __cnfn convert_long2_sat_rte(int2);
long2 __ovld __cnfn convert_long2_rtz(int2);
long2 __ovld __cnfn convert_long2_sat_rtz(int2);
long2 __ovld __cnfn convert_long2_rtp(int2);
long2 __ovld __cnfn convert_long2_sat_rtp(int2);
long2 __ovld __cnfn convert_long2_rtn(int2);
long2 __ovld __cnfn convert_long2_sat_rtn(int2);
long2 __ovld __cnfn convert_long2(int2);
long2 __ovld __cnfn convert_long2_sat(int2);
long2 __ovld __cnfn convert_long2_rte(uint2);
long2 __ovld __cnfn convert_long2_sat_rte(uint2);
long2 __ovld __cnfn convert_long2_rtz(uint2);
long2 __ovld __cnfn convert_long2_sat_rtz(uint2);
long2 __ovld __cnfn convert_long2_rtp(uint2);
long2 __ovld __cnfn convert_long2_sat_rtp(uint2);
long2 __ovld __cnfn convert_long2_rtn(uint2);
long2 __ovld __cnfn convert_long2_sat_rtn(uint2);
long2 __ovld __cnfn convert_long2(uint2);
long2 __ovld __cnfn convert_long2_sat(uint2);
long2 __ovld __cnfn convert_long2_rte(long2);
long2 __ovld __cnfn convert_long2_sat_rte(long2);
long2 __ovld __cnfn convert_long2_rtz(long2);
long2 __ovld __cnfn convert_long2_sat_rtz(long2);
long2 __ovld __cnfn convert_long2_rtp(long2);
long2 __ovld __cnfn convert_long2_sat_rtp(long2);
long2 __ovld __cnfn convert_long2_rtn(long2);
long2 __ovld __cnfn convert_long2_sat_rtn(long2);
long2 __ovld __cnfn convert_long2(long2);
long2 __ovld __cnfn convert_long2_sat(long2);
long2 __ovld __cnfn convert_long2_rte(ulong2);
long2 __ovld __cnfn convert_long2_sat_rte(ulong2);
long2 __ovld __cnfn convert_long2_rtz(ulong2);
long2 __ovld __cnfn convert_long2_sat_rtz(ulong2);
long2 __ovld __cnfn convert_long2_rtp(ulong2);
long2 __ovld __cnfn convert_long2_sat_rtp(ulong2);
long2 __ovld __cnfn convert_long2_rtn(ulong2);
long2 __ovld __cnfn convert_long2_sat_rtn(ulong2);
long2 __ovld __cnfn convert_long2(ulong2);
long2 __ovld __cnfn convert_long2_sat(ulong2);
long2 __ovld __cnfn convert_long2_rte(float2);
long2 __ovld __cnfn convert_long2_sat_rte(float2);
long2 __ovld __cnfn convert_long2_rtz(float2);
long2 __ovld __cnfn convert_long2_sat_rtz(float2);
long2 __ovld __cnfn convert_long2_rtp(float2);
long2 __ovld __cnfn convert_long2_sat_rtp(float2);
long2 __ovld __cnfn convert_long2_rtn(float2);
long2 __ovld __cnfn convert_long2_sat_rtn(float2);
long2 __ovld __cnfn convert_long2(float2);
long2 __ovld __cnfn convert_long2_sat(float2);
ulong2 __ovld __cnfn convert_ulong2_rte(char2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(char2);
ulong2 __ovld __cnfn convert_ulong2_rtz(char2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(char2);
ulong2 __ovld __cnfn convert_ulong2_rtp(char2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(char2);
ulong2 __ovld __cnfn convert_ulong2_rtn(char2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(char2);
ulong2 __ovld __cnfn convert_ulong2(char2);
ulong2 __ovld __cnfn convert_ulong2_sat(char2);
ulong2 __ovld __cnfn convert_ulong2_rte(uchar2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(uchar2);
ulong2 __ovld __cnfn convert_ulong2_rtz(uchar2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(uchar2);
ulong2 __ovld __cnfn convert_ulong2_rtp(uchar2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(uchar2);
ulong2 __ovld __cnfn convert_ulong2_rtn(uchar2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(uchar2);
ulong2 __ovld __cnfn convert_ulong2(uchar2);
ulong2 __ovld __cnfn convert_ulong2_sat(uchar2);
ulong2 __ovld __cnfn convert_ulong2_rte(short2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(short2);
ulong2 __ovld __cnfn convert_ulong2_rtz(short2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(short2);
ulong2 __ovld __cnfn convert_ulong2_rtp(short2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(short2);
ulong2 __ovld __cnfn convert_ulong2_rtn(short2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(short2);
ulong2 __ovld __cnfn convert_ulong2(short2);
ulong2 __ovld __cnfn convert_ulong2_sat(short2);
ulong2 __ovld __cnfn convert_ulong2_rte(ushort2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(ushort2);
ulong2 __ovld __cnfn convert_ulong2_rtz(ushort2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(ushort2);
ulong2 __ovld __cnfn convert_ulong2_rtp(ushort2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(ushort2);
ulong2 __ovld __cnfn convert_ulong2_rtn(ushort2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(ushort2);
ulong2 __ovld __cnfn convert_ulong2(ushort2);
ulong2 __ovld __cnfn convert_ulong2_sat(ushort2);
ulong2 __ovld __cnfn convert_ulong2_rte(int2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(int2);
ulong2 __ovld __cnfn convert_ulong2_rtz(int2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(int2);
ulong2 __ovld __cnfn convert_ulong2_rtp(int2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(int2);
ulong2 __ovld __cnfn convert_ulong2_rtn(int2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(int2);
ulong2 __ovld __cnfn convert_ulong2(int2);
ulong2 __ovld __cnfn convert_ulong2_sat(int2);
ulong2 __ovld __cnfn convert_ulong2_rte(uint2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(uint2);
ulong2 __ovld __cnfn convert_ulong2_rtz(uint2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(uint2);
ulong2 __ovld __cnfn convert_ulong2_rtp(uint2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(uint2);
ulong2 __ovld __cnfn convert_ulong2_rtn(uint2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(uint2);
ulong2 __ovld __cnfn convert_ulong2(uint2);
ulong2 __ovld __cnfn convert_ulong2_sat(uint2);
ulong2 __ovld __cnfn convert_ulong2_rte(long2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(long2);
ulong2 __ovld __cnfn convert_ulong2_rtz(long2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(long2);
ulong2 __ovld __cnfn convert_ulong2_rtp(long2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(long2);
ulong2 __ovld __cnfn convert_ulong2_rtn(long2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(long2);
ulong2 __ovld __cnfn convert_ulong2(long2);
ulong2 __ovld __cnfn convert_ulong2_sat(long2);
ulong2 __ovld __cnfn convert_ulong2_rte(ulong2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(ulong2);
ulong2 __ovld __cnfn convert_ulong2_rtz(ulong2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(ulong2);
ulong2 __ovld __cnfn convert_ulong2_rtp(ulong2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(ulong2);
ulong2 __ovld __cnfn convert_ulong2_rtn(ulong2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(ulong2);
ulong2 __ovld __cnfn convert_ulong2(ulong2);
ulong2 __ovld __cnfn convert_ulong2_sat(ulong2);
ulong2 __ovld __cnfn convert_ulong2_rte(float2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(float2);
ulong2 __ovld __cnfn convert_ulong2_rtz(float2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(float2);
ulong2 __ovld __cnfn convert_ulong2_rtp(float2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(float2);
ulong2 __ovld __cnfn convert_ulong2_rtn(float2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(float2);
ulong2 __ovld __cnfn convert_ulong2(float2);
ulong2 __ovld __cnfn convert_ulong2_sat(float2);
float2 __ovld __cnfn convert_float2_rte(char2);
float2 __ovld __cnfn convert_float2_rtz(char2);
float2 __ovld __cnfn convert_float2_rtp(char2);
float2 __ovld __cnfn convert_float2_rtn(char2);
float2 __ovld __cnfn convert_float2(char2);
float2 __ovld __cnfn convert_float2_rte(uchar2);
float2 __ovld __cnfn convert_float2_rtz(uchar2);
float2 __ovld __cnfn convert_float2_rtp(uchar2);
float2 __ovld __cnfn convert_float2_rtn(uchar2);
float2 __ovld __cnfn convert_float2(uchar2);
float2 __ovld __cnfn convert_float2_rte(short2);
float2 __ovld __cnfn convert_float2_rtz(short2);
float2 __ovld __cnfn convert_float2_rtp(short2);
float2 __ovld __cnfn convert_float2_rtn(short2);
float2 __ovld __cnfn convert_float2(short2);
float2 __ovld __cnfn convert_float2_rte(ushort2);
float2 __ovld __cnfn convert_float2_rtz(ushort2);
float2 __ovld __cnfn convert_float2_rtp(ushort2);
float2 __ovld __cnfn convert_float2_rtn(ushort2);
float2 __ovld __cnfn convert_float2(ushort2);
float2 __ovld __cnfn convert_float2_rte(int2);
float2 __ovld __cnfn convert_float2_rtz(int2);
float2 __ovld __cnfn convert_float2_rtp(int2);
float2 __ovld __cnfn convert_float2_rtn(int2);
float2 __ovld __cnfn convert_float2(int2);
float2 __ovld __cnfn convert_float2_rte(uint2);
float2 __ovld __cnfn convert_float2_rtz(uint2);
float2 __ovld __cnfn convert_float2_rtp(uint2);
float2 __ovld __cnfn convert_float2_rtn(uint2);
float2 __ovld __cnfn convert_float2(uint2);
float2 __ovld __cnfn convert_float2_rte(long2);
float2 __ovld __cnfn convert_float2_rtz(long2);
float2 __ovld __cnfn convert_float2_rtp(long2);
float2 __ovld __cnfn convert_float2_rtn(long2);
float2 __ovld __cnfn convert_float2(long2);
float2 __ovld __cnfn convert_float2_rte(ulong2);
float2 __ovld __cnfn convert_float2_rtz(ulong2);
float2 __ovld __cnfn convert_float2_rtp(ulong2);
float2 __ovld __cnfn convert_float2_rtn(ulong2);
float2 __ovld __cnfn convert_float2(ulong2);
float2 __ovld __cnfn convert_float2_rte(float2);
float2 __ovld __cnfn convert_float2_rtz(float2);
float2 __ovld __cnfn convert_float2_rtp(float2);
float2 __ovld __cnfn convert_float2_rtn(float2);
float2 __ovld __cnfn convert_float2(float2);
char3 __ovld __cnfn convert_char3_rte(char3);
char3 __ovld __cnfn convert_char3_sat_rte(char3);
char3 __ovld __cnfn convert_char3_rtz(char3);
char3 __ovld __cnfn convert_char3_sat_rtz(char3);
char3 __ovld __cnfn convert_char3_rtp(char3);
char3 __ovld __cnfn convert_char3_sat_rtp(char3);
char3 __ovld __cnfn convert_char3_rtn(char3);
char3 __ovld __cnfn convert_char3_sat_rtn(char3);
char3 __ovld __cnfn convert_char3(char3);
char3 __ovld __cnfn convert_char3_sat(char3);
char3 __ovld __cnfn convert_char3_rte(uchar3);
char3 __ovld __cnfn convert_char3_sat_rte(uchar3);
char3 __ovld __cnfn convert_char3_rtz(uchar3);
char3 __ovld __cnfn convert_char3_sat_rtz(uchar3);
char3 __ovld __cnfn convert_char3_rtp(uchar3);
char3 __ovld __cnfn convert_char3_sat_rtp(uchar3);
char3 __ovld __cnfn convert_char3_rtn(uchar3);
char3 __ovld __cnfn convert_char3_sat_rtn(uchar3);
char3 __ovld __cnfn convert_char3(uchar3);
char3 __ovld __cnfn convert_char3_sat(uchar3);
char3 __ovld __cnfn convert_char3_rte(short3);
char3 __ovld __cnfn convert_char3_sat_rte(short3);
char3 __ovld __cnfn convert_char3_rtz(short3);
char3 __ovld __cnfn convert_char3_sat_rtz(short3);
char3 __ovld __cnfn convert_char3_rtp(short3);
char3 __ovld __cnfn convert_char3_sat_rtp(short3);
char3 __ovld __cnfn convert_char3_rtn(short3);
char3 __ovld __cnfn convert_char3_sat_rtn(short3);
char3 __ovld __cnfn convert_char3(short3);
char3 __ovld __cnfn convert_char3_sat(short3);
char3 __ovld __cnfn convert_char3_rte(ushort3);
char3 __ovld __cnfn convert_char3_sat_rte(ushort3);
char3 __ovld __cnfn convert_char3_rtz(ushort3);
char3 __ovld __cnfn convert_char3_sat_rtz(ushort3);
char3 __ovld __cnfn convert_char3_rtp(ushort3);
char3 __ovld __cnfn convert_char3_sat_rtp(ushort3);
char3 __ovld __cnfn convert_char3_rtn(ushort3);
char3 __ovld __cnfn convert_char3_sat_rtn(ushort3);
char3 __ovld __cnfn convert_char3(ushort3);
char3 __ovld __cnfn convert_char3_sat(ushort3);
char3 __ovld __cnfn convert_char3_rte(int3);
char3 __ovld __cnfn convert_char3_sat_rte(int3);
char3 __ovld __cnfn convert_char3_rtz(int3);
char3 __ovld __cnfn convert_char3_sat_rtz(int3);
char3 __ovld __cnfn convert_char3_rtp(int3);
char3 __ovld __cnfn convert_char3_sat_rtp(int3);
char3 __ovld __cnfn convert_char3_rtn(int3);
char3 __ovld __cnfn convert_char3_sat_rtn(int3);
char3 __ovld __cnfn convert_char3(int3);
char3 __ovld __cnfn convert_char3_sat(int3);
char3 __ovld __cnfn convert_char3_rte(uint3);
char3 __ovld __cnfn convert_char3_sat_rte(uint3);
char3 __ovld __cnfn convert_char3_rtz(uint3);
char3 __ovld __cnfn convert_char3_sat_rtz(uint3);
char3 __ovld __cnfn convert_char3_rtp(uint3);
char3 __ovld __cnfn convert_char3_sat_rtp(uint3);
char3 __ovld __cnfn convert_char3_rtn(uint3);
char3 __ovld __cnfn convert_char3_sat_rtn(uint3);
char3 __ovld __cnfn convert_char3(uint3);
char3 __ovld __cnfn convert_char3_sat(uint3);
char3 __ovld __cnfn convert_char3_rte(long3);
char3 __ovld __cnfn convert_char3_sat_rte(long3);
char3 __ovld __cnfn convert_char3_rtz(long3);
char3 __ovld __cnfn convert_char3_sat_rtz(long3);
char3 __ovld __cnfn convert_char3_rtp(long3);
char3 __ovld __cnfn convert_char3_sat_rtp(long3);
char3 __ovld __cnfn convert_char3_rtn(long3);
char3 __ovld __cnfn convert_char3_sat_rtn(long3);
char3 __ovld __cnfn convert_char3(long3);
char3 __ovld __cnfn convert_char3_sat(long3);
char3 __ovld __cnfn convert_char3_rte(ulong3);
char3 __ovld __cnfn convert_char3_sat_rte(ulong3);
char3 __ovld __cnfn convert_char3_rtz(ulong3);
char3 __ovld __cnfn convert_char3_sat_rtz(ulong3);
char3 __ovld __cnfn convert_char3_rtp(ulong3);
char3 __ovld __cnfn convert_char3_sat_rtp(ulong3);
char3 __ovld __cnfn convert_char3_rtn(ulong3);
char3 __ovld __cnfn convert_char3_sat_rtn(ulong3);
char3 __ovld __cnfn convert_char3(ulong3);
char3 __ovld __cnfn convert_char3_sat(ulong3);
char3 __ovld __cnfn convert_char3_rte(float3);
char3 __ovld __cnfn convert_char3_sat_rte(float3);
char3 __ovld __cnfn convert_char3_rtz(float3);
char3 __ovld __cnfn convert_char3_sat_rtz(float3);
char3 __ovld __cnfn convert_char3_rtp(float3);
char3 __ovld __cnfn convert_char3_sat_rtp(float3);
char3 __ovld __cnfn convert_char3_rtn(float3);
char3 __ovld __cnfn convert_char3_sat_rtn(float3);
char3 __ovld __cnfn convert_char3(float3);
char3 __ovld __cnfn convert_char3_sat(float3);
uchar3 __ovld __cnfn convert_uchar3_rte(char3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(char3);
uchar3 __ovld __cnfn convert_uchar3_rtz(char3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(char3);
uchar3 __ovld __cnfn convert_uchar3_rtp(char3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(char3);
uchar3 __ovld __cnfn convert_uchar3_rtn(char3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(char3);
uchar3 __ovld __cnfn convert_uchar3(char3);
uchar3 __ovld __cnfn convert_uchar3_sat(char3);
uchar3 __ovld __cnfn convert_uchar3_rte(uchar3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(uchar3);
uchar3 __ovld __cnfn convert_uchar3_rtz(uchar3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(uchar3);
uchar3 __ovld __cnfn convert_uchar3_rtp(uchar3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(uchar3);
uchar3 __ovld __cnfn convert_uchar3_rtn(uchar3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(uchar3);
uchar3 __ovld __cnfn convert_uchar3(uchar3);
uchar3 __ovld __cnfn convert_uchar3_sat(uchar3);
uchar3 __ovld __cnfn convert_uchar3_rte(short3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(short3);
uchar3 __ovld __cnfn convert_uchar3_rtz(short3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(short3);
uchar3 __ovld __cnfn convert_uchar3_rtp(short3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(short3);
uchar3 __ovld __cnfn convert_uchar3_rtn(short3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(short3);
uchar3 __ovld __cnfn convert_uchar3(short3);
uchar3 __ovld __cnfn convert_uchar3_sat(short3);
uchar3 __ovld __cnfn convert_uchar3_rte(ushort3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(ushort3);
uchar3 __ovld __cnfn convert_uchar3_rtz(ushort3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(ushort3);
uchar3 __ovld __cnfn convert_uchar3_rtp(ushort3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(ushort3);
uchar3 __ovld __cnfn convert_uchar3_rtn(ushort3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(ushort3);
uchar3 __ovld __cnfn convert_uchar3(ushort3);
uchar3 __ovld __cnfn convert_uchar3_sat(ushort3);
uchar3 __ovld __cnfn convert_uchar3_rte(int3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(int3);
uchar3 __ovld __cnfn convert_uchar3_rtz(int3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(int3);
uchar3 __ovld __cnfn convert_uchar3_rtp(int3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(int3);
uchar3 __ovld __cnfn convert_uchar3_rtn(int3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(int3);
uchar3 __ovld __cnfn convert_uchar3(int3);
uchar3 __ovld __cnfn convert_uchar3_sat(int3);
uchar3 __ovld __cnfn convert_uchar3_rte(uint3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(uint3);
uchar3 __ovld __cnfn convert_uchar3_rtz(uint3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(uint3);
uchar3 __ovld __cnfn convert_uchar3_rtp(uint3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(uint3);
uchar3 __ovld __cnfn convert_uchar3_rtn(uint3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(uint3);
uchar3 __ovld __cnfn convert_uchar3(uint3);
uchar3 __ovld __cnfn convert_uchar3_sat(uint3);
uchar3 __ovld __cnfn convert_uchar3_rte(long3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(long3);
uchar3 __ovld __cnfn convert_uchar3_rtz(long3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(long3);
uchar3 __ovld __cnfn convert_uchar3_rtp(long3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(long3);
uchar3 __ovld __cnfn convert_uchar3_rtn(long3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(long3);
uchar3 __ovld __cnfn convert_uchar3(long3);
uchar3 __ovld __cnfn convert_uchar3_sat(long3);
uchar3 __ovld __cnfn convert_uchar3_rte(ulong3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(ulong3);
uchar3 __ovld __cnfn convert_uchar3_rtz(ulong3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(ulong3);
uchar3 __ovld __cnfn convert_uchar3_rtp(ulong3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(ulong3);
uchar3 __ovld __cnfn convert_uchar3_rtn(ulong3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(ulong3);
uchar3 __ovld __cnfn convert_uchar3(ulong3);
uchar3 __ovld __cnfn convert_uchar3_sat(ulong3);
uchar3 __ovld __cnfn convert_uchar3_rte(float3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(float3);
uchar3 __ovld __cnfn convert_uchar3_rtz(float3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(float3);
uchar3 __ovld __cnfn convert_uchar3_rtp(float3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(float3);
uchar3 __ovld __cnfn convert_uchar3_rtn(float3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(float3);
uchar3 __ovld __cnfn convert_uchar3(float3);
uchar3 __ovld __cnfn convert_uchar3_sat(float3);
short3 __ovld __cnfn convert_short3_rte(char3);
short3 __ovld __cnfn convert_short3_sat_rte(char3);
short3 __ovld __cnfn convert_short3_rtz(char3);
short3 __ovld __cnfn convert_short3_sat_rtz(char3);
short3 __ovld __cnfn convert_short3_rtp(char3);
short3 __ovld __cnfn convert_short3_sat_rtp(char3);
short3 __ovld __cnfn convert_short3_rtn(char3);
short3 __ovld __cnfn convert_short3_sat_rtn(char3);
short3 __ovld __cnfn convert_short3(char3);
short3 __ovld __cnfn convert_short3_sat(char3);
short3 __ovld __cnfn convert_short3_rte(uchar3);
short3 __ovld __cnfn convert_short3_sat_rte(uchar3);
short3 __ovld __cnfn convert_short3_rtz(uchar3);
short3 __ovld __cnfn convert_short3_sat_rtz(uchar3);
short3 __ovld __cnfn convert_short3_rtp(uchar3);
short3 __ovld __cnfn convert_short3_sat_rtp(uchar3);
short3 __ovld __cnfn convert_short3_rtn(uchar3);
short3 __ovld __cnfn convert_short3_sat_rtn(uchar3);
short3 __ovld __cnfn convert_short3(uchar3);
short3 __ovld __cnfn convert_short3_sat(uchar3);
short3 __ovld __cnfn convert_short3_rte(short3);
short3 __ovld __cnfn convert_short3_sat_rte(short3);
short3 __ovld __cnfn convert_short3_rtz(short3);
short3 __ovld __cnfn convert_short3_sat_rtz(short3);
short3 __ovld __cnfn convert_short3_rtp(short3);
short3 __ovld __cnfn convert_short3_sat_rtp(short3);
short3 __ovld __cnfn convert_short3_rtn(short3);
short3 __ovld __cnfn convert_short3_sat_rtn(short3);
short3 __ovld __cnfn convert_short3(short3);
short3 __ovld __cnfn convert_short3_sat(short3);
short3 __ovld __cnfn convert_short3_rte(ushort3);
short3 __ovld __cnfn convert_short3_sat_rte(ushort3);
short3 __ovld __cnfn convert_short3_rtz(ushort3);
short3 __ovld __cnfn convert_short3_sat_rtz(ushort3);
short3 __ovld __cnfn convert_short3_rtp(ushort3);
short3 __ovld __cnfn convert_short3_sat_rtp(ushort3);
short3 __ovld __cnfn convert_short3_rtn(ushort3);
short3 __ovld __cnfn convert_short3_sat_rtn(ushort3);
short3 __ovld __cnfn convert_short3(ushort3);
short3 __ovld __cnfn convert_short3_sat(ushort3);
short3 __ovld __cnfn convert_short3_rte(int3);
short3 __ovld __cnfn convert_short3_sat_rte(int3);
short3 __ovld __cnfn convert_short3_rtz(int3);
short3 __ovld __cnfn convert_short3_sat_rtz(int3);
short3 __ovld __cnfn convert_short3_rtp(int3);
short3 __ovld __cnfn convert_short3_sat_rtp(int3);
short3 __ovld __cnfn convert_short3_rtn(int3);
short3 __ovld __cnfn convert_short3_sat_rtn(int3);
short3 __ovld __cnfn convert_short3(int3);
short3 __ovld __cnfn convert_short3_sat(int3);
short3 __ovld __cnfn convert_short3_rte(uint3);
short3 __ovld __cnfn convert_short3_sat_rte(uint3);
short3 __ovld __cnfn convert_short3_rtz(uint3);
short3 __ovld __cnfn convert_short3_sat_rtz(uint3);
short3 __ovld __cnfn convert_short3_rtp(uint3);
short3 __ovld __cnfn convert_short3_sat_rtp(uint3);
short3 __ovld __cnfn convert_short3_rtn(uint3);
short3 __ovld __cnfn convert_short3_sat_rtn(uint3);
short3 __ovld __cnfn convert_short3(uint3);
short3 __ovld __cnfn convert_short3_sat(uint3);
short3 __ovld __cnfn convert_short3_rte(long3);
short3 __ovld __cnfn convert_short3_sat_rte(long3);
short3 __ovld __cnfn convert_short3_rtz(long3);
short3 __ovld __cnfn convert_short3_sat_rtz(long3);
short3 __ovld __cnfn convert_short3_rtp(long3);
short3 __ovld __cnfn convert_short3_sat_rtp(long3);
short3 __ovld __cnfn convert_short3_rtn(long3);
short3 __ovld __cnfn convert_short3_sat_rtn(long3);
short3 __ovld __cnfn convert_short3(long3);
short3 __ovld __cnfn convert_short3_sat(long3);
short3 __ovld __cnfn convert_short3_rte(ulong3);
short3 __ovld __cnfn convert_short3_sat_rte(ulong3);
short3 __ovld __cnfn convert_short3_rtz(ulong3);
short3 __ovld __cnfn convert_short3_sat_rtz(ulong3);
short3 __ovld __cnfn convert_short3_rtp(ulong3);
short3 __ovld __cnfn convert_short3_sat_rtp(ulong3);
short3 __ovld __cnfn convert_short3_rtn(ulong3);
short3 __ovld __cnfn convert_short3_sat_rtn(ulong3);
short3 __ovld __cnfn convert_short3(ulong3);
short3 __ovld __cnfn convert_short3_sat(ulong3);
short3 __ovld __cnfn convert_short3_rte(float3);
short3 __ovld __cnfn convert_short3_sat_rte(float3);
short3 __ovld __cnfn convert_short3_rtz(float3);
short3 __ovld __cnfn convert_short3_sat_rtz(float3);
short3 __ovld __cnfn convert_short3_rtp(float3);
short3 __ovld __cnfn convert_short3_sat_rtp(float3);
short3 __ovld __cnfn convert_short3_rtn(float3);
short3 __ovld __cnfn convert_short3_sat_rtn(float3);
short3 __ovld __cnfn convert_short3(float3);
short3 __ovld __cnfn convert_short3_sat(float3);
ushort3 __ovld __cnfn convert_ushort3_rte(char3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(char3);
ushort3 __ovld __cnfn convert_ushort3_rtz(char3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(char3);
ushort3 __ovld __cnfn convert_ushort3_rtp(char3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(char3);
ushort3 __ovld __cnfn convert_ushort3_rtn(char3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(char3);
ushort3 __ovld __cnfn convert_ushort3(char3);
ushort3 __ovld __cnfn convert_ushort3_sat(char3);
ushort3 __ovld __cnfn convert_ushort3_rte(uchar3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(uchar3);
ushort3 __ovld __cnfn convert_ushort3_rtz(uchar3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(uchar3);
ushort3 __ovld __cnfn convert_ushort3_rtp(uchar3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(uchar3);
ushort3 __ovld __cnfn convert_ushort3_rtn(uchar3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(uchar3);
ushort3 __ovld __cnfn convert_ushort3(uchar3);
ushort3 __ovld __cnfn convert_ushort3_sat(uchar3);
ushort3 __ovld __cnfn convert_ushort3_rte(short3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(short3);
ushort3 __ovld __cnfn convert_ushort3_rtz(short3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(short3);
ushort3 __ovld __cnfn convert_ushort3_rtp(short3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(short3);
ushort3 __ovld __cnfn convert_ushort3_rtn(short3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(short3);
ushort3 __ovld __cnfn convert_ushort3(short3);
ushort3 __ovld __cnfn convert_ushort3_sat(short3);
ushort3 __ovld __cnfn convert_ushort3_rte(ushort3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(ushort3);
ushort3 __ovld __cnfn convert_ushort3_rtz(ushort3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(ushort3);
ushort3 __ovld __cnfn convert_ushort3_rtp(ushort3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(ushort3);
ushort3 __ovld __cnfn convert_ushort3_rtn(ushort3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(ushort3);
ushort3 __ovld __cnfn convert_ushort3(ushort3);
ushort3 __ovld __cnfn convert_ushort3_sat(ushort3);
ushort3 __ovld __cnfn convert_ushort3_rte(int3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(int3);
ushort3 __ovld __cnfn convert_ushort3_rtz(int3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(int3);
ushort3 __ovld __cnfn convert_ushort3_rtp(int3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(int3);
ushort3 __ovld __cnfn convert_ushort3_rtn(int3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(int3);
ushort3 __ovld __cnfn convert_ushort3(int3);
ushort3 __ovld __cnfn convert_ushort3_sat(int3);
ushort3 __ovld __cnfn convert_ushort3_rte(uint3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(uint3);
ushort3 __ovld __cnfn convert_ushort3_rtz(uint3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(uint3);
ushort3 __ovld __cnfn convert_ushort3_rtp(uint3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(uint3);
ushort3 __ovld __cnfn convert_ushort3_rtn(uint3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(uint3);
ushort3 __ovld __cnfn convert_ushort3(uint3);
ushort3 __ovld __cnfn convert_ushort3_sat(uint3);
ushort3 __ovld __cnfn convert_ushort3_rte(long3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(long3);
ushort3 __ovld __cnfn convert_ushort3_rtz(long3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(long3);
ushort3 __ovld __cnfn convert_ushort3_rtp(long3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(long3);
ushort3 __ovld __cnfn convert_ushort3_rtn(long3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(long3);
ushort3 __ovld __cnfn convert_ushort3(long3);
ushort3 __ovld __cnfn convert_ushort3_sat(long3);
ushort3 __ovld __cnfn convert_ushort3_rte(ulong3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(ulong3);
ushort3 __ovld __cnfn convert_ushort3_rtz(ulong3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(ulong3);
ushort3 __ovld __cnfn convert_ushort3_rtp(ulong3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(ulong3);
ushort3 __ovld __cnfn convert_ushort3_rtn(ulong3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(ulong3);
ushort3 __ovld __cnfn convert_ushort3(ulong3);
ushort3 __ovld __cnfn convert_ushort3_sat(ulong3);
ushort3 __ovld __cnfn convert_ushort3_rte(float3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(float3);
ushort3 __ovld __cnfn convert_ushort3_rtz(float3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(float3);
ushort3 __ovld __cnfn convert_ushort3_rtp(float3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(float3);
ushort3 __ovld __cnfn convert_ushort3_rtn(float3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(float3);
ushort3 __ovld __cnfn convert_ushort3(float3);
ushort3 __ovld __cnfn convert_ushort3_sat(float3);
int3 __ovld __cnfn convert_int3_rte(char3);
int3 __ovld __cnfn convert_int3_sat_rte(char3);
int3 __ovld __cnfn convert_int3_rtz(char3);
int3 __ovld __cnfn convert_int3_sat_rtz(char3);
int3 __ovld __cnfn convert_int3_rtp(char3);
int3 __ovld __cnfn convert_int3_sat_rtp(char3);
int3 __ovld __cnfn convert_int3_rtn(char3);
int3 __ovld __cnfn convert_int3_sat_rtn(char3);
int3 __ovld __cnfn convert_int3(char3);
int3 __ovld __cnfn convert_int3_sat(char3);
int3 __ovld __cnfn convert_int3_rte(uchar3);
int3 __ovld __cnfn convert_int3_sat_rte(uchar3);
int3 __ovld __cnfn convert_int3_rtz(uchar3);
int3 __ovld __cnfn convert_int3_sat_rtz(uchar3);
int3 __ovld __cnfn convert_int3_rtp(uchar3);
int3 __ovld __cnfn convert_int3_sat_rtp(uchar3);
int3 __ovld __cnfn convert_int3_rtn(uchar3);
int3 __ovld __cnfn convert_int3_sat_rtn(uchar3);
int3 __ovld __cnfn convert_int3(uchar3);
int3 __ovld __cnfn convert_int3_sat(uchar3);
int3 __ovld __cnfn convert_int3_rte(short3);
int3 __ovld __cnfn convert_int3_sat_rte(short3);
int3 __ovld __cnfn convert_int3_rtz(short3);
int3 __ovld __cnfn convert_int3_sat_rtz(short3);
int3 __ovld __cnfn convert_int3_rtp(short3);
int3 __ovld __cnfn convert_int3_sat_rtp(short3);
int3 __ovld __cnfn convert_int3_rtn(short3);
int3 __ovld __cnfn convert_int3_sat_rtn(short3);
int3 __ovld __cnfn convert_int3(short3);
int3 __ovld __cnfn convert_int3_sat(short3);
int3 __ovld __cnfn convert_int3_rte(ushort3);
int3 __ovld __cnfn convert_int3_sat_rte(ushort3);
int3 __ovld __cnfn convert_int3_rtz(ushort3);
int3 __ovld __cnfn convert_int3_sat_rtz(ushort3);
int3 __ovld __cnfn convert_int3_rtp(ushort3);
int3 __ovld __cnfn convert_int3_sat_rtp(ushort3);
int3 __ovld __cnfn convert_int3_rtn(ushort3);
int3 __ovld __cnfn convert_int3_sat_rtn(ushort3);
int3 __ovld __cnfn convert_int3(ushort3);
int3 __ovld __cnfn convert_int3_sat(ushort3);
int3 __ovld __cnfn convert_int3_rte(int3);
int3 __ovld __cnfn convert_int3_sat_rte(int3);
int3 __ovld __cnfn convert_int3_rtz(int3);
int3 __ovld __cnfn convert_int3_sat_rtz(int3);
int3 __ovld __cnfn convert_int3_rtp(int3);
int3 __ovld __cnfn convert_int3_sat_rtp(int3);
int3 __ovld __cnfn convert_int3_rtn(int3);
int3 __ovld __cnfn convert_int3_sat_rtn(int3);
int3 __ovld __cnfn convert_int3(int3);
int3 __ovld __cnfn convert_int3_sat(int3);
int3 __ovld __cnfn convert_int3_rte(uint3);
int3 __ovld __cnfn convert_int3_sat_rte(uint3);
int3 __ovld __cnfn convert_int3_rtz(uint3);
int3 __ovld __cnfn convert_int3_sat_rtz(uint3);
int3 __ovld __cnfn convert_int3_rtp(uint3);
int3 __ovld __cnfn convert_int3_sat_rtp(uint3);
int3 __ovld __cnfn convert_int3_rtn(uint3);
int3 __ovld __cnfn convert_int3_sat_rtn(uint3);
int3 __ovld __cnfn convert_int3(uint3);
int3 __ovld __cnfn convert_int3_sat(uint3);
int3 __ovld __cnfn convert_int3_rte(long3);
int3 __ovld __cnfn convert_int3_sat_rte(long3);
int3 __ovld __cnfn convert_int3_rtz(long3);
int3 __ovld __cnfn convert_int3_sat_rtz(long3);
int3 __ovld __cnfn convert_int3_rtp(long3);
int3 __ovld __cnfn convert_int3_sat_rtp(long3);
int3 __ovld __cnfn convert_int3_rtn(long3);
int3 __ovld __cnfn convert_int3_sat_rtn(long3);
int3 __ovld __cnfn convert_int3(long3);
int3 __ovld __cnfn convert_int3_sat(long3);
int3 __ovld __cnfn convert_int3_rte(ulong3);
int3 __ovld __cnfn convert_int3_sat_rte(ulong3);
int3 __ovld __cnfn convert_int3_rtz(ulong3);
int3 __ovld __cnfn convert_int3_sat_rtz(ulong3);
int3 __ovld __cnfn convert_int3_rtp(ulong3);
int3 __ovld __cnfn convert_int3_sat_rtp(ulong3);
int3 __ovld __cnfn convert_int3_rtn(ulong3);
int3 __ovld __cnfn convert_int3_sat_rtn(ulong3);
int3 __ovld __cnfn convert_int3(ulong3);
int3 __ovld __cnfn convert_int3_sat(ulong3);
int3 __ovld __cnfn convert_int3_rte(float3);
int3 __ovld __cnfn convert_int3_sat_rte(float3);
int3 __ovld __cnfn convert_int3_rtz(float3);
int3 __ovld __cnfn convert_int3_sat_rtz(float3);
int3 __ovld __cnfn convert_int3_rtp(float3);
int3 __ovld __cnfn convert_int3_sat_rtp(float3);
int3 __ovld __cnfn convert_int3_rtn(float3);
int3 __ovld __cnfn convert_int3_sat_rtn(float3);
int3 __ovld __cnfn convert_int3(float3);
int3 __ovld __cnfn convert_int3_sat(float3);
uint3 __ovld __cnfn convert_uint3_rte(char3);
uint3 __ovld __cnfn convert_uint3_sat_rte(char3);
uint3 __ovld __cnfn convert_uint3_rtz(char3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(char3);
uint3 __ovld __cnfn convert_uint3_rtp(char3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(char3);
uint3 __ovld __cnfn convert_uint3_rtn(char3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(char3);
uint3 __ovld __cnfn convert_uint3(char3);
uint3 __ovld __cnfn convert_uint3_sat(char3);
uint3 __ovld __cnfn convert_uint3_rte(uchar3);
uint3 __ovld __cnfn convert_uint3_sat_rte(uchar3);
uint3 __ovld __cnfn convert_uint3_rtz(uchar3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(uchar3);
uint3 __ovld __cnfn convert_uint3_rtp(uchar3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(uchar3);
uint3 __ovld __cnfn convert_uint3_rtn(uchar3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(uchar3);
uint3 __ovld __cnfn convert_uint3(uchar3);
uint3 __ovld __cnfn convert_uint3_sat(uchar3);
uint3 __ovld __cnfn convert_uint3_rte(short3);
uint3 __ovld __cnfn convert_uint3_sat_rte(short3);
uint3 __ovld __cnfn convert_uint3_rtz(short3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(short3);
uint3 __ovld __cnfn convert_uint3_rtp(short3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(short3);
uint3 __ovld __cnfn convert_uint3_rtn(short3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(short3);
uint3 __ovld __cnfn convert_uint3(short3);
uint3 __ovld __cnfn convert_uint3_sat(short3);
uint3 __ovld __cnfn convert_uint3_rte(ushort3);
uint3 __ovld __cnfn convert_uint3_sat_rte(ushort3);
uint3 __ovld __cnfn convert_uint3_rtz(ushort3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(ushort3);
uint3 __ovld __cnfn convert_uint3_rtp(ushort3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(ushort3);
uint3 __ovld __cnfn convert_uint3_rtn(ushort3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(ushort3);
uint3 __ovld __cnfn convert_uint3(ushort3);
uint3 __ovld __cnfn convert_uint3_sat(ushort3);
uint3 __ovld __cnfn convert_uint3_rte(int3);
uint3 __ovld __cnfn convert_uint3_sat_rte(int3);
uint3 __ovld __cnfn convert_uint3_rtz(int3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(int3);
uint3 __ovld __cnfn convert_uint3_rtp(int3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(int3);
uint3 __ovld __cnfn convert_uint3_rtn(int3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(int3);
uint3 __ovld __cnfn convert_uint3(int3);
uint3 __ovld __cnfn convert_uint3_sat(int3);
uint3 __ovld __cnfn convert_uint3_rte(uint3);
uint3 __ovld __cnfn convert_uint3_sat_rte(uint3);
uint3 __ovld __cnfn convert_uint3_rtz(uint3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(uint3);
uint3 __ovld __cnfn convert_uint3_rtp(uint3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(uint3);
uint3 __ovld __cnfn convert_uint3_rtn(uint3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(uint3);
uint3 __ovld __cnfn convert_uint3(uint3);
uint3 __ovld __cnfn convert_uint3_sat(uint3);
uint3 __ovld __cnfn convert_uint3_rte(long3);
uint3 __ovld __cnfn convert_uint3_sat_rte(long3);
uint3 __ovld __cnfn convert_uint3_rtz(long3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(long3);
uint3 __ovld __cnfn convert_uint3_rtp(long3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(long3);
uint3 __ovld __cnfn convert_uint3_rtn(long3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(long3);
uint3 __ovld __cnfn convert_uint3(long3);
uint3 __ovld __cnfn convert_uint3_sat(long3);
uint3 __ovld __cnfn convert_uint3_rte(ulong3);
uint3 __ovld __cnfn convert_uint3_sat_rte(ulong3);
uint3 __ovld __cnfn convert_uint3_rtz(ulong3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(ulong3);
uint3 __ovld __cnfn convert_uint3_rtp(ulong3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(ulong3);
uint3 __ovld __cnfn convert_uint3_rtn(ulong3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(ulong3);
uint3 __ovld __cnfn convert_uint3(ulong3);
uint3 __ovld __cnfn convert_uint3_sat(ulong3);
uint3 __ovld __cnfn convert_uint3_rte(float3);
uint3 __ovld __cnfn convert_uint3_sat_rte(float3);
uint3 __ovld __cnfn convert_uint3_rtz(float3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(float3);
uint3 __ovld __cnfn convert_uint3_rtp(float3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(float3);
uint3 __ovld __cnfn convert_uint3_rtn(float3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(float3);
uint3 __ovld __cnfn convert_uint3(float3);
uint3 __ovld __cnfn convert_uint3_sat(float3);
long3 __ovld __cnfn convert_long3_rte(char3);
long3 __ovld __cnfn convert_long3_sat_rte(char3);
long3 __ovld __cnfn convert_long3_rtz(char3);
long3 __ovld __cnfn convert_long3_sat_rtz(char3);
long3 __ovld __cnfn convert_long3_rtp(char3);
long3 __ovld __cnfn convert_long3_sat_rtp(char3);
long3 __ovld __cnfn convert_long3_rtn(char3);
long3 __ovld __cnfn convert_long3_sat_rtn(char3);
long3 __ovld __cnfn convert_long3(char3);
long3 __ovld __cnfn convert_long3_sat(char3);
long3 __ovld __cnfn convert_long3_rte(uchar3);
long3 __ovld __cnfn convert_long3_sat_rte(uchar3);
long3 __ovld __cnfn convert_long3_rtz(uchar3);
long3 __ovld __cnfn convert_long3_sat_rtz(uchar3);
long3 __ovld __cnfn convert_long3_rtp(uchar3);
long3 __ovld __cnfn convert_long3_sat_rtp(uchar3);
long3 __ovld __cnfn convert_long3_rtn(uchar3);
long3 __ovld __cnfn convert_long3_sat_rtn(uchar3);
long3 __ovld __cnfn convert_long3(uchar3);
long3 __ovld __cnfn convert_long3_sat(uchar3);
long3 __ovld __cnfn convert_long3_rte(short3);
long3 __ovld __cnfn convert_long3_sat_rte(short3);
long3 __ovld __cnfn convert_long3_rtz(short3);
long3 __ovld __cnfn convert_long3_sat_rtz(short3);
long3 __ovld __cnfn convert_long3_rtp(short3);
long3 __ovld __cnfn convert_long3_sat_rtp(short3);
long3 __ovld __cnfn convert_long3_rtn(short3);
long3 __ovld __cnfn convert_long3_sat_rtn(short3);
long3 __ovld __cnfn convert_long3(short3);
long3 __ovld __cnfn convert_long3_sat(short3);
long3 __ovld __cnfn convert_long3_rte(ushort3);
long3 __ovld __cnfn convert_long3_sat_rte(ushort3);
long3 __ovld __cnfn convert_long3_rtz(ushort3);
long3 __ovld __cnfn convert_long3_sat_rtz(ushort3);
long3 __ovld __cnfn convert_long3_rtp(ushort3);
long3 __ovld __cnfn convert_long3_sat_rtp(ushort3);
long3 __ovld __cnfn convert_long3_rtn(ushort3);
long3 __ovld __cnfn convert_long3_sat_rtn(ushort3);
long3 __ovld __cnfn convert_long3(ushort3);
long3 __ovld __cnfn convert_long3_sat(ushort3);
long3 __ovld __cnfn convert_long3_rte(int3);
long3 __ovld __cnfn convert_long3_sat_rte(int3);
long3 __ovld __cnfn convert_long3_rtz(int3);
long3 __ovld __cnfn convert_long3_sat_rtz(int3);
long3 __ovld __cnfn convert_long3_rtp(int3);
long3 __ovld __cnfn convert_long3_sat_rtp(int3);
long3 __ovld __cnfn convert_long3_rtn(int3);
long3 __ovld __cnfn convert_long3_sat_rtn(int3);
long3 __ovld __cnfn convert_long3(int3);
long3 __ovld __cnfn convert_long3_sat(int3);
long3 __ovld __cnfn convert_long3_rte(uint3);
long3 __ovld __cnfn convert_long3_sat_rte(uint3);
long3 __ovld __cnfn convert_long3_rtz(uint3);
long3 __ovld __cnfn convert_long3_sat_rtz(uint3);
long3 __ovld __cnfn convert_long3_rtp(uint3);
long3 __ovld __cnfn convert_long3_sat_rtp(uint3);
long3 __ovld __cnfn convert_long3_rtn(uint3);
long3 __ovld __cnfn convert_long3_sat_rtn(uint3);
long3 __ovld __cnfn convert_long3(uint3);
long3 __ovld __cnfn convert_long3_sat(uint3);
long3 __ovld __cnfn convert_long3_rte(long3);
long3 __ovld __cnfn convert_long3_sat_rte(long3);
long3 __ovld __cnfn convert_long3_rtz(long3);
long3 __ovld __cnfn convert_long3_sat_rtz(long3);
long3 __ovld __cnfn convert_long3_rtp(long3);
long3 __ovld __cnfn convert_long3_sat_rtp(long3);
long3 __ovld __cnfn convert_long3_rtn(long3);
long3 __ovld __cnfn convert_long3_sat_rtn(long3);
long3 __ovld __cnfn convert_long3(long3);
long3 __ovld __cnfn convert_long3_sat(long3);
long3 __ovld __cnfn convert_long3_rte(ulong3);
long3 __ovld __cnfn convert_long3_sat_rte(ulong3);
long3 __ovld __cnfn convert_long3_rtz(ulong3);
long3 __ovld __cnfn convert_long3_sat_rtz(ulong3);
long3 __ovld __cnfn convert_long3_rtp(ulong3);
long3 __ovld __cnfn convert_long3_sat_rtp(ulong3);
long3 __ovld __cnfn convert_long3_rtn(ulong3);
long3 __ovld __cnfn convert_long3_sat_rtn(ulong3);
long3 __ovld __cnfn convert_long3(ulong3);
long3 __ovld __cnfn convert_long3_sat(ulong3);
long3 __ovld __cnfn convert_long3_rte(float3);
long3 __ovld __cnfn convert_long3_sat_rte(float3);
long3 __ovld __cnfn convert_long3_rtz(float3);
long3 __ovld __cnfn convert_long3_sat_rtz(float3);
long3 __ovld __cnfn convert_long3_rtp(float3);
long3 __ovld __cnfn convert_long3_sat_rtp(float3);
long3 __ovld __cnfn convert_long3_rtn(float3);
long3 __ovld __cnfn convert_long3_sat_rtn(float3);
long3 __ovld __cnfn convert_long3(float3);
long3 __ovld __cnfn convert_long3_sat(float3);
ulong3 __ovld __cnfn convert_ulong3_rte(char3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(char3);
ulong3 __ovld __cnfn convert_ulong3_rtz(char3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(char3);
ulong3 __ovld __cnfn convert_ulong3_rtp(char3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(char3);
ulong3 __ovld __cnfn convert_ulong3_rtn(char3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(char3);
ulong3 __ovld __cnfn convert_ulong3(char3);
ulong3 __ovld __cnfn convert_ulong3_sat(char3);
ulong3 __ovld __cnfn convert_ulong3_rte(uchar3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(uchar3);
ulong3 __ovld __cnfn convert_ulong3_rtz(uchar3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(uchar3);
ulong3 __ovld __cnfn convert_ulong3_rtp(uchar3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(uchar3);
ulong3 __ovld __cnfn convert_ulong3_rtn(uchar3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(uchar3);
ulong3 __ovld __cnfn convert_ulong3(uchar3);
ulong3 __ovld __cnfn convert_ulong3_sat(uchar3);
ulong3 __ovld __cnfn convert_ulong3_rte(short3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(short3);
ulong3 __ovld __cnfn convert_ulong3_rtz(short3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(short3);
ulong3 __ovld __cnfn convert_ulong3_rtp(short3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(short3);
ulong3 __ovld __cnfn convert_ulong3_rtn(short3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(short3);
ulong3 __ovld __cnfn convert_ulong3(short3);
ulong3 __ovld __cnfn convert_ulong3_sat(short3);
ulong3 __ovld __cnfn convert_ulong3_rte(ushort3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(ushort3);
ulong3 __ovld __cnfn convert_ulong3_rtz(ushort3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(ushort3);
ulong3 __ovld __cnfn convert_ulong3_rtp(ushort3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(ushort3);
ulong3 __ovld __cnfn convert_ulong3_rtn(ushort3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(ushort3);
ulong3 __ovld __cnfn convert_ulong3(ushort3);
ulong3 __ovld __cnfn convert_ulong3_sat(ushort3);
ulong3 __ovld __cnfn convert_ulong3_rte(int3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(int3);
ulong3 __ovld __cnfn convert_ulong3_rtz(int3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(int3);
ulong3 __ovld __cnfn convert_ulong3_rtp(int3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(int3);
ulong3 __ovld __cnfn convert_ulong3_rtn(int3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(int3);
ulong3 __ovld __cnfn convert_ulong3(int3);
ulong3 __ovld __cnfn convert_ulong3_sat(int3);
ulong3 __ovld __cnfn convert_ulong3_rte(uint3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(uint3);
ulong3 __ovld __cnfn convert_ulong3_rtz(uint3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(uint3);
ulong3 __ovld __cnfn convert_ulong3_rtp(uint3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(uint3);
ulong3 __ovld __cnfn convert_ulong3_rtn(uint3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(uint3);
ulong3 __ovld __cnfn convert_ulong3(uint3);
ulong3 __ovld __cnfn convert_ulong3_sat(uint3);
ulong3 __ovld __cnfn convert_ulong3_rte(long3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(long3);
ulong3 __ovld __cnfn convert_ulong3_rtz(long3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(long3);
ulong3 __ovld __cnfn convert_ulong3_rtp(long3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(long3);
ulong3 __ovld __cnfn convert_ulong3_rtn(long3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(long3);
ulong3 __ovld __cnfn convert_ulong3(long3);
ulong3 __ovld __cnfn convert_ulong3_sat(long3);
ulong3 __ovld __cnfn convert_ulong3_rte(ulong3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(ulong3);
ulong3 __ovld __cnfn convert_ulong3_rtz(ulong3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(ulong3);
ulong3 __ovld __cnfn convert_ulong3_rtp(ulong3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(ulong3);
ulong3 __ovld __cnfn convert_ulong3_rtn(ulong3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(ulong3);
ulong3 __ovld __cnfn convert_ulong3(ulong3);
ulong3 __ovld __cnfn convert_ulong3_sat(ulong3);
ulong3 __ovld __cnfn convert_ulong3_rte(float3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(float3);
ulong3 __ovld __cnfn convert_ulong3_rtz(float3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(float3);
ulong3 __ovld __cnfn convert_ulong3_rtp(float3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(float3);
ulong3 __ovld __cnfn convert_ulong3_rtn(float3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(float3);
ulong3 __ovld __cnfn convert_ulong3(float3);
ulong3 __ovld __cnfn convert_ulong3_sat(float3);
float3 __ovld __cnfn convert_float3_rte(char3);
float3 __ovld __cnfn convert_float3_rtz(char3);
float3 __ovld __cnfn convert_float3_rtp(char3);
float3 __ovld __cnfn convert_float3_rtn(char3);
float3 __ovld __cnfn convert_float3(char3);
float3 __ovld __cnfn convert_float3_rte(uchar3);
float3 __ovld __cnfn convert_float3_rtz(uchar3);
float3 __ovld __cnfn convert_float3_rtp(uchar3);
float3 __ovld __cnfn convert_float3_rtn(uchar3);
float3 __ovld __cnfn convert_float3(uchar3);
float3 __ovld __cnfn convert_float3_rte(short3);
float3 __ovld __cnfn convert_float3_rtz(short3);
float3 __ovld __cnfn convert_float3_rtp(short3);
float3 __ovld __cnfn convert_float3_rtn(short3);
float3 __ovld __cnfn convert_float3(short3);
float3 __ovld __cnfn convert_float3_rte(ushort3);
float3 __ovld __cnfn convert_float3_rtz(ushort3);
float3 __ovld __cnfn convert_float3_rtp(ushort3);
float3 __ovld __cnfn convert_float3_rtn(ushort3);
float3 __ovld __cnfn convert_float3(ushort3);
float3 __ovld __cnfn convert_float3_rte(int3);
float3 __ovld __cnfn convert_float3_rtz(int3);
float3 __ovld __cnfn convert_float3_rtp(int3);
float3 __ovld __cnfn convert_float3_rtn(int3);
float3 __ovld __cnfn convert_float3(int3);
float3 __ovld __cnfn convert_float3_rte(uint3);
float3 __ovld __cnfn convert_float3_rtz(uint3);
float3 __ovld __cnfn convert_float3_rtp(uint3);
float3 __ovld __cnfn convert_float3_rtn(uint3);
float3 __ovld __cnfn convert_float3(uint3);
float3 __ovld __cnfn convert_float3_rte(long3);
float3 __ovld __cnfn convert_float3_rtz(long3);
float3 __ovld __cnfn convert_float3_rtp(long3);
float3 __ovld __cnfn convert_float3_rtn(long3);
float3 __ovld __cnfn convert_float3(long3);
float3 __ovld __cnfn convert_float3_rte(ulong3);
float3 __ovld __cnfn convert_float3_rtz(ulong3);
float3 __ovld __cnfn convert_float3_rtp(ulong3);
float3 __ovld __cnfn convert_float3_rtn(ulong3);
float3 __ovld __cnfn convert_float3(ulong3);
float3 __ovld __cnfn convert_float3_rte(float3);
float3 __ovld __cnfn convert_float3_rtz(float3);
float3 __ovld __cnfn convert_float3_rtp(float3);
float3 __ovld __cnfn convert_float3_rtn(float3);
float3 __ovld __cnfn convert_float3(float3);
char4 __ovld __cnfn convert_char4_rte(char4);
char4 __ovld __cnfn convert_char4_sat_rte(char4);
char4 __ovld __cnfn convert_char4_rtz(char4);
char4 __ovld __cnfn convert_char4_sat_rtz(char4);
char4 __ovld __cnfn convert_char4_rtp(char4);
char4 __ovld __cnfn convert_char4_sat_rtp(char4);
char4 __ovld __cnfn convert_char4_rtn(char4);
char4 __ovld __cnfn convert_char4_sat_rtn(char4);
char4 __ovld __cnfn convert_char4(char4);
char4 __ovld __cnfn convert_char4_sat(char4);
char4 __ovld __cnfn convert_char4_rte(uchar4);
char4 __ovld __cnfn convert_char4_sat_rte(uchar4);
char4 __ovld __cnfn convert_char4_rtz(uchar4);
char4 __ovld __cnfn convert_char4_sat_rtz(uchar4);
char4 __ovld __cnfn convert_char4_rtp(uchar4);
char4 __ovld __cnfn convert_char4_sat_rtp(uchar4);
char4 __ovld __cnfn convert_char4_rtn(uchar4);
char4 __ovld __cnfn convert_char4_sat_rtn(uchar4);
char4 __ovld __cnfn convert_char4(uchar4);
char4 __ovld __cnfn convert_char4_sat(uchar4);
char4 __ovld __cnfn convert_char4_rte(short4);
char4 __ovld __cnfn convert_char4_sat_rte(short4);
char4 __ovld __cnfn convert_char4_rtz(short4);
char4 __ovld __cnfn convert_char4_sat_rtz(short4);
char4 __ovld __cnfn convert_char4_rtp(short4);
char4 __ovld __cnfn convert_char4_sat_rtp(short4);
char4 __ovld __cnfn convert_char4_rtn(short4);
char4 __ovld __cnfn convert_char4_sat_rtn(short4);
char4 __ovld __cnfn convert_char4(short4);
char4 __ovld __cnfn convert_char4_sat(short4);
char4 __ovld __cnfn convert_char4_rte(ushort4);
char4 __ovld __cnfn convert_char4_sat_rte(ushort4);
char4 __ovld __cnfn convert_char4_rtz(ushort4);
char4 __ovld __cnfn convert_char4_sat_rtz(ushort4);
char4 __ovld __cnfn convert_char4_rtp(ushort4);
char4 __ovld __cnfn convert_char4_sat_rtp(ushort4);
char4 __ovld __cnfn convert_char4_rtn(ushort4);
char4 __ovld __cnfn convert_char4_sat_rtn(ushort4);
char4 __ovld __cnfn convert_char4(ushort4);
char4 __ovld __cnfn convert_char4_sat(ushort4);
char4 __ovld __cnfn convert_char4_rte(int4);
char4 __ovld __cnfn convert_char4_sat_rte(int4);
char4 __ovld __cnfn convert_char4_rtz(int4);
char4 __ovld __cnfn convert_char4_sat_rtz(int4);
char4 __ovld __cnfn convert_char4_rtp(int4);
char4 __ovld __cnfn convert_char4_sat_rtp(int4);
char4 __ovld __cnfn convert_char4_rtn(int4);
char4 __ovld __cnfn convert_char4_sat_rtn(int4);
char4 __ovld __cnfn convert_char4(int4);
char4 __ovld __cnfn convert_char4_sat(int4);
char4 __ovld __cnfn convert_char4_rte(uint4);
char4 __ovld __cnfn convert_char4_sat_rte(uint4);
char4 __ovld __cnfn convert_char4_rtz(uint4);
char4 __ovld __cnfn convert_char4_sat_rtz(uint4);
char4 __ovld __cnfn convert_char4_rtp(uint4);
char4 __ovld __cnfn convert_char4_sat_rtp(uint4);
char4 __ovld __cnfn convert_char4_rtn(uint4);
char4 __ovld __cnfn convert_char4_sat_rtn(uint4);
char4 __ovld __cnfn convert_char4(uint4);
char4 __ovld __cnfn convert_char4_sat(uint4);
char4 __ovld __cnfn convert_char4_rte(long4);
char4 __ovld __cnfn convert_char4_sat_rte(long4);
char4 __ovld __cnfn convert_char4_rtz(long4);
char4 __ovld __cnfn convert_char4_sat_rtz(long4);
char4 __ovld __cnfn convert_char4_rtp(long4);
char4 __ovld __cnfn convert_char4_sat_rtp(long4);
char4 __ovld __cnfn convert_char4_rtn(long4);
char4 __ovld __cnfn convert_char4_sat_rtn(long4);
char4 __ovld __cnfn convert_char4(long4);
char4 __ovld __cnfn convert_char4_sat(long4);
char4 __ovld __cnfn convert_char4_rte(ulong4);
char4 __ovld __cnfn convert_char4_sat_rte(ulong4);
char4 __ovld __cnfn convert_char4_rtz(ulong4);
char4 __ovld __cnfn convert_char4_sat_rtz(ulong4);
char4 __ovld __cnfn convert_char4_rtp(ulong4);
char4 __ovld __cnfn convert_char4_sat_rtp(ulong4);
char4 __ovld __cnfn convert_char4_rtn(ulong4);
char4 __ovld __cnfn convert_char4_sat_rtn(ulong4);
char4 __ovld __cnfn convert_char4(ulong4);
char4 __ovld __cnfn convert_char4_sat(ulong4);
char4 __ovld __cnfn convert_char4_rte(float4);
char4 __ovld __cnfn convert_char4_sat_rte(float4);
char4 __ovld __cnfn convert_char4_rtz(float4);
char4 __ovld __cnfn convert_char4_sat_rtz(float4);
char4 __ovld __cnfn convert_char4_rtp(float4);
char4 __ovld __cnfn convert_char4_sat_rtp(float4);
char4 __ovld __cnfn convert_char4_rtn(float4);
char4 __ovld __cnfn convert_char4_sat_rtn(float4);
char4 __ovld __cnfn convert_char4(float4);
char4 __ovld __cnfn convert_char4_sat(float4);
uchar4 __ovld __cnfn convert_uchar4_rte(char4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(char4);
uchar4 __ovld __cnfn convert_uchar4_rtz(char4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(char4);
uchar4 __ovld __cnfn convert_uchar4_rtp(char4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(char4);
uchar4 __ovld __cnfn convert_uchar4_rtn(char4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(char4);
uchar4 __ovld __cnfn convert_uchar4(char4);
uchar4 __ovld __cnfn convert_uchar4_sat(char4);
uchar4 __ovld __cnfn convert_uchar4_rte(uchar4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(uchar4);
uchar4 __ovld __cnfn convert_uchar4_rtz(uchar4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(uchar4);
uchar4 __ovld __cnfn convert_uchar4_rtp(uchar4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(uchar4);
uchar4 __ovld __cnfn convert_uchar4_rtn(uchar4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(uchar4);
uchar4 __ovld __cnfn convert_uchar4(uchar4);
uchar4 __ovld __cnfn convert_uchar4_sat(uchar4);
uchar4 __ovld __cnfn convert_uchar4_rte(short4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(short4);
uchar4 __ovld __cnfn convert_uchar4_rtz(short4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(short4);
uchar4 __ovld __cnfn convert_uchar4_rtp(short4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(short4);
uchar4 __ovld __cnfn convert_uchar4_rtn(short4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(short4);
uchar4 __ovld __cnfn convert_uchar4(short4);
uchar4 __ovld __cnfn convert_uchar4_sat(short4);
uchar4 __ovld __cnfn convert_uchar4_rte(ushort4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(ushort4);
uchar4 __ovld __cnfn convert_uchar4_rtz(ushort4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(ushort4);
uchar4 __ovld __cnfn convert_uchar4_rtp(ushort4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(ushort4);
uchar4 __ovld __cnfn convert_uchar4_rtn(ushort4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(ushort4);
uchar4 __ovld __cnfn convert_uchar4(ushort4);
uchar4 __ovld __cnfn convert_uchar4_sat(ushort4);
uchar4 __ovld __cnfn convert_uchar4_rte(int4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(int4);
uchar4 __ovld __cnfn convert_uchar4_rtz(int4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(int4);
uchar4 __ovld __cnfn convert_uchar4_rtp(int4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(int4);
uchar4 __ovld __cnfn convert_uchar4_rtn(int4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(int4);
uchar4 __ovld __cnfn convert_uchar4(int4);
uchar4 __ovld __cnfn convert_uchar4_sat(int4);
uchar4 __ovld __cnfn convert_uchar4_rte(uint4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(uint4);
uchar4 __ovld __cnfn convert_uchar4_rtz(uint4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(uint4);
uchar4 __ovld __cnfn convert_uchar4_rtp(uint4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(uint4);
uchar4 __ovld __cnfn convert_uchar4_rtn(uint4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(uint4);
uchar4 __ovld __cnfn convert_uchar4(uint4);
uchar4 __ovld __cnfn convert_uchar4_sat(uint4);
uchar4 __ovld __cnfn convert_uchar4_rte(long4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(long4);
uchar4 __ovld __cnfn convert_uchar4_rtz(long4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(long4);
uchar4 __ovld __cnfn convert_uchar4_rtp(long4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(long4);
uchar4 __ovld __cnfn convert_uchar4_rtn(long4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(long4);
uchar4 __ovld __cnfn convert_uchar4(long4);
uchar4 __ovld __cnfn convert_uchar4_sat(long4);
uchar4 __ovld __cnfn convert_uchar4_rte(ulong4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(ulong4);
uchar4 __ovld __cnfn convert_uchar4_rtz(ulong4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(ulong4);
uchar4 __ovld __cnfn convert_uchar4_rtp(ulong4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(ulong4);
uchar4 __ovld __cnfn convert_uchar4_rtn(ulong4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(ulong4);
uchar4 __ovld __cnfn convert_uchar4(ulong4);
uchar4 __ovld __cnfn convert_uchar4_sat(ulong4);
uchar4 __ovld __cnfn convert_uchar4_rte(float4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(float4);
uchar4 __ovld __cnfn convert_uchar4_rtz(float4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(float4);
uchar4 __ovld __cnfn convert_uchar4_rtp(float4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(float4);
uchar4 __ovld __cnfn convert_uchar4_rtn(float4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(float4);
uchar4 __ovld __cnfn convert_uchar4(float4);
uchar4 __ovld __cnfn convert_uchar4_sat(float4);
short4 __ovld __cnfn convert_short4_rte(char4);
short4 __ovld __cnfn convert_short4_sat_rte(char4);
short4 __ovld __cnfn convert_short4_rtz(char4);
short4 __ovld __cnfn convert_short4_sat_rtz(char4);
short4 __ovld __cnfn convert_short4_rtp(char4);
short4 __ovld __cnfn convert_short4_sat_rtp(char4);
short4 __ovld __cnfn convert_short4_rtn(char4);
short4 __ovld __cnfn convert_short4_sat_rtn(char4);
short4 __ovld __cnfn convert_short4(char4);
short4 __ovld __cnfn convert_short4_sat(char4);
short4 __ovld __cnfn convert_short4_rte(uchar4);
short4 __ovld __cnfn convert_short4_sat_rte(uchar4);
short4 __ovld __cnfn convert_short4_rtz(uchar4);
short4 __ovld __cnfn convert_short4_sat_rtz(uchar4);
short4 __ovld __cnfn convert_short4_rtp(uchar4);
short4 __ovld __cnfn convert_short4_sat_rtp(uchar4);
short4 __ovld __cnfn convert_short4_rtn(uchar4);
short4 __ovld __cnfn convert_short4_sat_rtn(uchar4);
short4 __ovld __cnfn convert_short4(uchar4);
short4 __ovld __cnfn convert_short4_sat(uchar4);
short4 __ovld __cnfn convert_short4_rte(short4);
short4 __ovld __cnfn convert_short4_sat_rte(short4);
short4 __ovld __cnfn convert_short4_rtz(short4);
short4 __ovld __cnfn convert_short4_sat_rtz(short4);
short4 __ovld __cnfn convert_short4_rtp(short4);
short4 __ovld __cnfn convert_short4_sat_rtp(short4);
short4 __ovld __cnfn convert_short4_rtn(short4);
short4 __ovld __cnfn convert_short4_sat_rtn(short4);
short4 __ovld __cnfn convert_short4(short4);
short4 __ovld __cnfn convert_short4_sat(short4);
short4 __ovld __cnfn convert_short4_rte(ushort4);
short4 __ovld __cnfn convert_short4_sat_rte(ushort4);
short4 __ovld __cnfn convert_short4_rtz(ushort4);
short4 __ovld __cnfn convert_short4_sat_rtz(ushort4);
short4 __ovld __cnfn convert_short4_rtp(ushort4);
short4 __ovld __cnfn convert_short4_sat_rtp(ushort4);
short4 __ovld __cnfn convert_short4_rtn(ushort4);
short4 __ovld __cnfn convert_short4_sat_rtn(ushort4);
short4 __ovld __cnfn convert_short4(ushort4);
short4 __ovld __cnfn convert_short4_sat(ushort4);
short4 __ovld __cnfn convert_short4_rte(int4);
short4 __ovld __cnfn convert_short4_sat_rte(int4);
short4 __ovld __cnfn convert_short4_rtz(int4);
short4 __ovld __cnfn convert_short4_sat_rtz(int4);
short4 __ovld __cnfn convert_short4_rtp(int4);
short4 __ovld __cnfn convert_short4_sat_rtp(int4);
short4 __ovld __cnfn convert_short4_rtn(int4);
short4 __ovld __cnfn convert_short4_sat_rtn(int4);
short4 __ovld __cnfn convert_short4(int4);
short4 __ovld __cnfn convert_short4_sat(int4);
short4 __ovld __cnfn convert_short4_rte(uint4);
short4 __ovld __cnfn convert_short4_sat_rte(uint4);
short4 __ovld __cnfn convert_short4_rtz(uint4);
short4 __ovld __cnfn convert_short4_sat_rtz(uint4);
short4 __ovld __cnfn convert_short4_rtp(uint4);
short4 __ovld __cnfn convert_short4_sat_rtp(uint4);
short4 __ovld __cnfn convert_short4_rtn(uint4);
short4 __ovld __cnfn convert_short4_sat_rtn(uint4);
short4 __ovld __cnfn convert_short4(uint4);
short4 __ovld __cnfn convert_short4_sat(uint4);
short4 __ovld __cnfn convert_short4_rte(long4);
short4 __ovld __cnfn convert_short4_sat_rte(long4);
short4 __ovld __cnfn convert_short4_rtz(long4);
short4 __ovld __cnfn convert_short4_sat_rtz(long4);
short4 __ovld __cnfn convert_short4_rtp(long4);
short4 __ovld __cnfn convert_short4_sat_rtp(long4);
short4 __ovld __cnfn convert_short4_rtn(long4);
short4 __ovld __cnfn convert_short4_sat_rtn(long4);
short4 __ovld __cnfn convert_short4(long4);
short4 __ovld __cnfn convert_short4_sat(long4);
short4 __ovld __cnfn convert_short4_rte(ulong4);
short4 __ovld __cnfn convert_short4_sat_rte(ulong4);
short4 __ovld __cnfn convert_short4_rtz(ulong4);
short4 __ovld __cnfn convert_short4_sat_rtz(ulong4);
short4 __ovld __cnfn convert_short4_rtp(ulong4);
short4 __ovld __cnfn convert_short4_sat_rtp(ulong4);
short4 __ovld __cnfn convert_short4_rtn(ulong4);
short4 __ovld __cnfn convert_short4_sat_rtn(ulong4);
short4 __ovld __cnfn convert_short4(ulong4);
short4 __ovld __cnfn convert_short4_sat(ulong4);
short4 __ovld __cnfn convert_short4_rte(float4);
short4 __ovld __cnfn convert_short4_sat_rte(float4);
short4 __ovld __cnfn convert_short4_rtz(float4);
short4 __ovld __cnfn convert_short4_sat_rtz(float4);
short4 __ovld __cnfn convert_short4_rtp(float4);
short4 __ovld __cnfn convert_short4_sat_rtp(float4);
short4 __ovld __cnfn convert_short4_rtn(float4);
short4 __ovld __cnfn convert_short4_sat_rtn(float4);
short4 __ovld __cnfn convert_short4(float4);
short4 __ovld __cnfn convert_short4_sat(float4);
ushort4 __ovld __cnfn convert_ushort4_rte(char4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(char4);
ushort4 __ovld __cnfn convert_ushort4_rtz(char4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(char4);
ushort4 __ovld __cnfn convert_ushort4_rtp(char4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(char4);
ushort4 __ovld __cnfn convert_ushort4_rtn(char4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(char4);
ushort4 __ovld __cnfn convert_ushort4(char4);
ushort4 __ovld __cnfn convert_ushort4_sat(char4);
ushort4 __ovld __cnfn convert_ushort4_rte(uchar4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(uchar4);
ushort4 __ovld __cnfn convert_ushort4_rtz(uchar4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(uchar4);
ushort4 __ovld __cnfn convert_ushort4_rtp(uchar4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(uchar4);
ushort4 __ovld __cnfn convert_ushort4_rtn(uchar4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(uchar4);
ushort4 __ovld __cnfn convert_ushort4(uchar4);
ushort4 __ovld __cnfn convert_ushort4_sat(uchar4);
ushort4 __ovld __cnfn convert_ushort4_rte(short4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(short4);
ushort4 __ovld __cnfn convert_ushort4_rtz(short4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(short4);
ushort4 __ovld __cnfn convert_ushort4_rtp(short4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(short4);
ushort4 __ovld __cnfn convert_ushort4_rtn(short4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(short4);
ushort4 __ovld __cnfn convert_ushort4(short4);
ushort4 __ovld __cnfn convert_ushort4_sat(short4);
ushort4 __ovld __cnfn convert_ushort4_rte(ushort4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(ushort4);
ushort4 __ovld __cnfn convert_ushort4_rtz(ushort4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(ushort4);
ushort4 __ovld __cnfn convert_ushort4_rtp(ushort4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(ushort4);
ushort4 __ovld __cnfn convert_ushort4_rtn(ushort4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(ushort4);
ushort4 __ovld __cnfn convert_ushort4(ushort4);
ushort4 __ovld __cnfn convert_ushort4_sat(ushort4);
ushort4 __ovld __cnfn convert_ushort4_rte(int4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(int4);
ushort4 __ovld __cnfn convert_ushort4_rtz(int4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(int4);
ushort4 __ovld __cnfn convert_ushort4_rtp(int4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(int4);
ushort4 __ovld __cnfn convert_ushort4_rtn(int4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(int4);
ushort4 __ovld __cnfn convert_ushort4(int4);
ushort4 __ovld __cnfn convert_ushort4_sat(int4);
ushort4 __ovld __cnfn convert_ushort4_rte(uint4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(uint4);
ushort4 __ovld __cnfn convert_ushort4_rtz(uint4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(uint4);
ushort4 __ovld __cnfn convert_ushort4_rtp(uint4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(uint4);
ushort4 __ovld __cnfn convert_ushort4_rtn(uint4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(uint4);
ushort4 __ovld __cnfn convert_ushort4(uint4);
ushort4 __ovld __cnfn convert_ushort4_sat(uint4);
ushort4 __ovld __cnfn convert_ushort4_rte(long4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(long4);
ushort4 __ovld __cnfn convert_ushort4_rtz(long4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(long4);
ushort4 __ovld __cnfn convert_ushort4_rtp(long4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(long4);
ushort4 __ovld __cnfn convert_ushort4_rtn(long4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(long4);
ushort4 __ovld __cnfn convert_ushort4(long4);
ushort4 __ovld __cnfn convert_ushort4_sat(long4);
ushort4 __ovld __cnfn convert_ushort4_rte(ulong4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(ulong4);
ushort4 __ovld __cnfn convert_ushort4_rtz(ulong4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(ulong4);
ushort4 __ovld __cnfn convert_ushort4_rtp(ulong4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(ulong4);
ushort4 __ovld __cnfn convert_ushort4_rtn(ulong4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(ulong4);
ushort4 __ovld __cnfn convert_ushort4(ulong4);
ushort4 __ovld __cnfn convert_ushort4_sat(ulong4);
ushort4 __ovld __cnfn convert_ushort4_rte(float4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(float4);
ushort4 __ovld __cnfn convert_ushort4_rtz(float4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(float4);
ushort4 __ovld __cnfn convert_ushort4_rtp(float4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(float4);
ushort4 __ovld __cnfn convert_ushort4_rtn(float4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(float4);
ushort4 __ovld __cnfn convert_ushort4(float4);
ushort4 __ovld __cnfn convert_ushort4_sat(float4);
int4 __ovld __cnfn convert_int4_rte(char4);
int4 __ovld __cnfn convert_int4_sat_rte(char4);
int4 __ovld __cnfn convert_int4_rtz(char4);
int4 __ovld __cnfn convert_int4_sat_rtz(char4);
int4 __ovld __cnfn convert_int4_rtp(char4);
int4 __ovld __cnfn convert_int4_sat_rtp(char4);
int4 __ovld __cnfn convert_int4_rtn(char4);
int4 __ovld __cnfn convert_int4_sat_rtn(char4);
int4 __ovld __cnfn convert_int4(char4);
int4 __ovld __cnfn convert_int4_sat(char4);
int4 __ovld __cnfn convert_int4_rte(uchar4);
int4 __ovld __cnfn convert_int4_sat_rte(uchar4);
int4 __ovld __cnfn convert_int4_rtz(uchar4);
int4 __ovld __cnfn convert_int4_sat_rtz(uchar4);
int4 __ovld __cnfn convert_int4_rtp(uchar4);
int4 __ovld __cnfn convert_int4_sat_rtp(uchar4);
int4 __ovld __cnfn convert_int4_rtn(uchar4);
int4 __ovld __cnfn convert_int4_sat_rtn(uchar4);
int4 __ovld __cnfn convert_int4(uchar4);
int4 __ovld __cnfn convert_int4_sat(uchar4);
int4 __ovld __cnfn convert_int4_rte(short4);
int4 __ovld __cnfn convert_int4_sat_rte(short4);
int4 __ovld __cnfn convert_int4_rtz(short4);
int4 __ovld __cnfn convert_int4_sat_rtz(short4);
int4 __ovld __cnfn convert_int4_rtp(short4);
int4 __ovld __cnfn convert_int4_sat_rtp(short4);
int4 __ovld __cnfn convert_int4_rtn(short4);
int4 __ovld __cnfn convert_int4_sat_rtn(short4);
int4 __ovld __cnfn convert_int4(short4);
int4 __ovld __cnfn convert_int4_sat(short4);
int4 __ovld __cnfn convert_int4_rte(ushort4);
int4 __ovld __cnfn convert_int4_sat_rte(ushort4);
int4 __ovld __cnfn convert_int4_rtz(ushort4);
int4 __ovld __cnfn convert_int4_sat_rtz(ushort4);
int4 __ovld __cnfn convert_int4_rtp(ushort4);
int4 __ovld __cnfn convert_int4_sat_rtp(ushort4);
int4 __ovld __cnfn convert_int4_rtn(ushort4);
int4 __ovld __cnfn convert_int4_sat_rtn(ushort4);
int4 __ovld __cnfn convert_int4(ushort4);
int4 __ovld __cnfn convert_int4_sat(ushort4);
int4 __ovld __cnfn convert_int4_rte(int4);
int4 __ovld __cnfn convert_int4_sat_rte(int4);
int4 __ovld __cnfn convert_int4_rtz(int4);
int4 __ovld __cnfn convert_int4_sat_rtz(int4);
int4 __ovld __cnfn convert_int4_rtp(int4);
int4 __ovld __cnfn convert_int4_sat_rtp(int4);
int4 __ovld __cnfn convert_int4_rtn(int4);
int4 __ovld __cnfn convert_int4_sat_rtn(int4);
int4 __ovld __cnfn convert_int4(int4);
int4 __ovld __cnfn convert_int4_sat(int4);
int4 __ovld __cnfn convert_int4_rte(uint4);
int4 __ovld __cnfn convert_int4_sat_rte(uint4);
int4 __ovld __cnfn convert_int4_rtz(uint4);
int4 __ovld __cnfn convert_int4_sat_rtz(uint4);
int4 __ovld __cnfn convert_int4_rtp(uint4);
int4 __ovld __cnfn convert_int4_sat_rtp(uint4);
int4 __ovld __cnfn convert_int4_rtn(uint4);
int4 __ovld __cnfn convert_int4_sat_rtn(uint4);
int4 __ovld __cnfn convert_int4(uint4);
int4 __ovld __cnfn convert_int4_sat(uint4);
int4 __ovld __cnfn convert_int4_rte(long4);
int4 __ovld __cnfn convert_int4_sat_rte(long4);
int4 __ovld __cnfn convert_int4_rtz(long4);
int4 __ovld __cnfn convert_int4_sat_rtz(long4);
int4 __ovld __cnfn convert_int4_rtp(long4);
int4 __ovld __cnfn convert_int4_sat_rtp(long4);
int4 __ovld __cnfn convert_int4_rtn(long4);
int4 __ovld __cnfn convert_int4_sat_rtn(long4);
int4 __ovld __cnfn convert_int4(long4);
int4 __ovld __cnfn convert_int4_sat(long4);
int4 __ovld __cnfn convert_int4_rte(ulong4);
int4 __ovld __cnfn convert_int4_sat_rte(ulong4);
int4 __ovld __cnfn convert_int4_rtz(ulong4);
int4 __ovld __cnfn convert_int4_sat_rtz(ulong4);
int4 __ovld __cnfn convert_int4_rtp(ulong4);
int4 __ovld __cnfn convert_int4_sat_rtp(ulong4);
int4 __ovld __cnfn convert_int4_rtn(ulong4);
int4 __ovld __cnfn convert_int4_sat_rtn(ulong4);
int4 __ovld __cnfn convert_int4(ulong4);
int4 __ovld __cnfn convert_int4_sat(ulong4);
int4 __ovld __cnfn convert_int4_rte(float4);
int4 __ovld __cnfn convert_int4_sat_rte(float4);
int4 __ovld __cnfn convert_int4_rtz(float4);
int4 __ovld __cnfn convert_int4_sat_rtz(float4);
int4 __ovld __cnfn convert_int4_rtp(float4);
int4 __ovld __cnfn convert_int4_sat_rtp(float4);
int4 __ovld __cnfn convert_int4_rtn(float4);
int4 __ovld __cnfn convert_int4_sat_rtn(float4);
int4 __ovld __cnfn convert_int4(float4);
int4 __ovld __cnfn convert_int4_sat(float4);
uint4 __ovld __cnfn convert_uint4_rte(char4);
uint4 __ovld __cnfn convert_uint4_sat_rte(char4);
uint4 __ovld __cnfn convert_uint4_rtz(char4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(char4);
uint4 __ovld __cnfn convert_uint4_rtp(char4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(char4);
uint4 __ovld __cnfn convert_uint4_rtn(char4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(char4);
uint4 __ovld __cnfn convert_uint4(char4);
uint4 __ovld __cnfn convert_uint4_sat(char4);
uint4 __ovld __cnfn convert_uint4_rte(uchar4);
uint4 __ovld __cnfn convert_uint4_sat_rte(uchar4);
uint4 __ovld __cnfn convert_uint4_rtz(uchar4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(uchar4);
uint4 __ovld __cnfn convert_uint4_rtp(uchar4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(uchar4);
uint4 __ovld __cnfn convert_uint4_rtn(uchar4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(uchar4);
uint4 __ovld __cnfn convert_uint4(uchar4);
uint4 __ovld __cnfn convert_uint4_sat(uchar4);
uint4 __ovld __cnfn convert_uint4_rte(short4);
uint4 __ovld __cnfn convert_uint4_sat_rte(short4);
uint4 __ovld __cnfn convert_uint4_rtz(short4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(short4);
uint4 __ovld __cnfn convert_uint4_rtp(short4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(short4);
uint4 __ovld __cnfn convert_uint4_rtn(short4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(short4);
uint4 __ovld __cnfn convert_uint4(short4);
uint4 __ovld __cnfn convert_uint4_sat(short4);
uint4 __ovld __cnfn convert_uint4_rte(ushort4);
uint4 __ovld __cnfn convert_uint4_sat_rte(ushort4);
uint4 __ovld __cnfn convert_uint4_rtz(ushort4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(ushort4);
uint4 __ovld __cnfn convert_uint4_rtp(ushort4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(ushort4);
uint4 __ovld __cnfn convert_uint4_rtn(ushort4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(ushort4);
uint4 __ovld __cnfn convert_uint4(ushort4);
uint4 __ovld __cnfn convert_uint4_sat(ushort4);
uint4 __ovld __cnfn convert_uint4_rte(int4);
uint4 __ovld __cnfn convert_uint4_sat_rte(int4);
uint4 __ovld __cnfn convert_uint4_rtz(int4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(int4);
uint4 __ovld __cnfn convert_uint4_rtp(int4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(int4);
uint4 __ovld __cnfn convert_uint4_rtn(int4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(int4);
uint4 __ovld __cnfn convert_uint4(int4);
uint4 __ovld __cnfn convert_uint4_sat(int4);
uint4 __ovld __cnfn convert_uint4_rte(uint4);
uint4 __ovld __cnfn convert_uint4_sat_rte(uint4);
uint4 __ovld __cnfn convert_uint4_rtz(uint4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(uint4);
uint4 __ovld __cnfn convert_uint4_rtp(uint4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(uint4);
uint4 __ovld __cnfn convert_uint4_rtn(uint4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(uint4);
uint4 __ovld __cnfn convert_uint4(uint4);
uint4 __ovld __cnfn convert_uint4_sat(uint4);
uint4 __ovld __cnfn convert_uint4_rte(long4);
uint4 __ovld __cnfn convert_uint4_sat_rte(long4);
uint4 __ovld __cnfn convert_uint4_rtz(long4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(long4);
uint4 __ovld __cnfn convert_uint4_rtp(long4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(long4);
uint4 __ovld __cnfn convert_uint4_rtn(long4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(long4);
uint4 __ovld __cnfn convert_uint4(long4);
uint4 __ovld __cnfn convert_uint4_sat(long4);
uint4 __ovld __cnfn convert_uint4_rte(ulong4);
uint4 __ovld __cnfn convert_uint4_sat_rte(ulong4);
uint4 __ovld __cnfn convert_uint4_rtz(ulong4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(ulong4);
uint4 __ovld __cnfn convert_uint4_rtp(ulong4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(ulong4);
uint4 __ovld __cnfn convert_uint4_rtn(ulong4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(ulong4);
uint4 __ovld __cnfn convert_uint4(ulong4);
uint4 __ovld __cnfn convert_uint4_sat(ulong4);
uint4 __ovld __cnfn convert_uint4_rte(float4);
uint4 __ovld __cnfn convert_uint4_sat_rte(float4);
uint4 __ovld __cnfn convert_uint4_rtz(float4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(float4);
uint4 __ovld __cnfn convert_uint4_rtp(float4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(float4);
uint4 __ovld __cnfn convert_uint4_rtn(float4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(float4);
uint4 __ovld __cnfn convert_uint4(float4);
uint4 __ovld __cnfn convert_uint4_sat(float4);
long4 __ovld __cnfn convert_long4_rte(char4);
long4 __ovld __cnfn convert_long4_sat_rte(char4);
long4 __ovld __cnfn convert_long4_rtz(char4);
long4 __ovld __cnfn convert_long4_sat_rtz(char4);
long4 __ovld __cnfn convert_long4_rtp(char4);
long4 __ovld __cnfn convert_long4_sat_rtp(char4);
long4 __ovld __cnfn convert_long4_rtn(char4);
long4 __ovld __cnfn convert_long4_sat_rtn(char4);
long4 __ovld __cnfn convert_long4(char4);
long4 __ovld __cnfn convert_long4_sat(char4);
long4 __ovld __cnfn convert_long4_rte(uchar4);
long4 __ovld __cnfn convert_long4_sat_rte(uchar4);
long4 __ovld __cnfn convert_long4_rtz(uchar4);
long4 __ovld __cnfn convert_long4_sat_rtz(uchar4);
long4 __ovld __cnfn convert_long4_rtp(uchar4);
long4 __ovld __cnfn convert_long4_sat_rtp(uchar4);
long4 __ovld __cnfn convert_long4_rtn(uchar4);
long4 __ovld __cnfn convert_long4_sat_rtn(uchar4);
long4 __ovld __cnfn convert_long4(uchar4);
long4 __ovld __cnfn convert_long4_sat(uchar4);
long4 __ovld __cnfn convert_long4_rte(short4);
long4 __ovld __cnfn convert_long4_sat_rte(short4);
long4 __ovld __cnfn convert_long4_rtz(short4);
long4 __ovld __cnfn convert_long4_sat_rtz(short4);
long4 __ovld __cnfn convert_long4_rtp(short4);
long4 __ovld __cnfn convert_long4_sat_rtp(short4);
long4 __ovld __cnfn convert_long4_rtn(short4);
long4 __ovld __cnfn convert_long4_sat_rtn(short4);
long4 __ovld __cnfn convert_long4(short4);
long4 __ovld __cnfn convert_long4_sat(short4);
long4 __ovld __cnfn convert_long4_rte(ushort4);
long4 __ovld __cnfn convert_long4_sat_rte(ushort4);
long4 __ovld __cnfn convert_long4_rtz(ushort4);
long4 __ovld __cnfn convert_long4_sat_rtz(ushort4);
long4 __ovld __cnfn convert_long4_rtp(ushort4);
long4 __ovld __cnfn convert_long4_sat_rtp(ushort4);
long4 __ovld __cnfn convert_long4_rtn(ushort4);
long4 __ovld __cnfn convert_long4_sat_rtn(ushort4);
long4 __ovld __cnfn convert_long4(ushort4);
long4 __ovld __cnfn convert_long4_sat(ushort4);
long4 __ovld __cnfn convert_long4_rte(int4);
long4 __ovld __cnfn convert_long4_sat_rte(int4);
long4 __ovld __cnfn convert_long4_rtz(int4);
long4 __ovld __cnfn convert_long4_sat_rtz(int4);
long4 __ovld __cnfn convert_long4_rtp(int4);
long4 __ovld __cnfn convert_long4_sat_rtp(int4);
long4 __ovld __cnfn convert_long4_rtn(int4);
long4 __ovld __cnfn convert_long4_sat_rtn(int4);
long4 __ovld __cnfn convert_long4(int4);
long4 __ovld __cnfn convert_long4_sat(int4);
long4 __ovld __cnfn convert_long4_rte(uint4);
long4 __ovld __cnfn convert_long4_sat_rte(uint4);
long4 __ovld __cnfn convert_long4_rtz(uint4);
long4 __ovld __cnfn convert_long4_sat_rtz(uint4);
long4 __ovld __cnfn convert_long4_rtp(uint4);
long4 __ovld __cnfn convert_long4_sat_rtp(uint4);
long4 __ovld __cnfn convert_long4_rtn(uint4);
long4 __ovld __cnfn convert_long4_sat_rtn(uint4);
long4 __ovld __cnfn convert_long4(uint4);
long4 __ovld __cnfn convert_long4_sat(uint4);
long4 __ovld __cnfn convert_long4_rte(long4);
long4 __ovld __cnfn convert_long4_sat_rte(long4);
long4 __ovld __cnfn convert_long4_rtz(long4);
long4 __ovld __cnfn convert_long4_sat_rtz(long4);
long4 __ovld __cnfn convert_long4_rtp(long4);
long4 __ovld __cnfn convert_long4_sat_rtp(long4);
long4 __ovld __cnfn convert_long4_rtn(long4);
long4 __ovld __cnfn convert_long4_sat_rtn(long4);
long4 __ovld __cnfn convert_long4(long4);
long4 __ovld __cnfn convert_long4_sat(long4);
long4 __ovld __cnfn convert_long4_rte(ulong4);
long4 __ovld __cnfn convert_long4_sat_rte(ulong4);
long4 __ovld __cnfn convert_long4_rtz(ulong4);
long4 __ovld __cnfn convert_long4_sat_rtz(ulong4);
long4 __ovld __cnfn convert_long4_rtp(ulong4);
long4 __ovld __cnfn convert_long4_sat_rtp(ulong4);
long4 __ovld __cnfn convert_long4_rtn(ulong4);
long4 __ovld __cnfn convert_long4_sat_rtn(ulong4);
long4 __ovld __cnfn convert_long4(ulong4);
long4 __ovld __cnfn convert_long4_sat(ulong4);
long4 __ovld __cnfn convert_long4_rte(float4);
long4 __ovld __cnfn convert_long4_sat_rte(float4);
long4 __ovld __cnfn convert_long4_rtz(float4);
long4 __ovld __cnfn convert_long4_sat_rtz(float4);
long4 __ovld __cnfn convert_long4_rtp(float4);
long4 __ovld __cnfn convert_long4_sat_rtp(float4);
long4 __ovld __cnfn convert_long4_rtn(float4);
long4 __ovld __cnfn convert_long4_sat_rtn(float4);
long4 __ovld __cnfn convert_long4(float4);
long4 __ovld __cnfn convert_long4_sat(float4);
ulong4 __ovld __cnfn convert_ulong4_rte(char4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(char4);
ulong4 __ovld __cnfn convert_ulong4_rtz(char4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(char4);
ulong4 __ovld __cnfn convert_ulong4_rtp(char4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(char4);
ulong4 __ovld __cnfn convert_ulong4_rtn(char4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(char4);
ulong4 __ovld __cnfn convert_ulong4(char4);
ulong4 __ovld __cnfn convert_ulong4_sat(char4);
ulong4 __ovld __cnfn convert_ulong4_rte(uchar4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(uchar4);
ulong4 __ovld __cnfn convert_ulong4_rtz(uchar4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(uchar4);
ulong4 __ovld __cnfn convert_ulong4_rtp(uchar4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(uchar4);
ulong4 __ovld __cnfn convert_ulong4_rtn(uchar4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(uchar4);
ulong4 __ovld __cnfn convert_ulong4(uchar4);
ulong4 __ovld __cnfn convert_ulong4_sat(uchar4);
ulong4 __ovld __cnfn convert_ulong4_rte(short4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(short4);
ulong4 __ovld __cnfn convert_ulong4_rtz(short4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(short4);
ulong4 __ovld __cnfn convert_ulong4_rtp(short4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(short4);
ulong4 __ovld __cnfn convert_ulong4_rtn(short4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(short4);
ulong4 __ovld __cnfn convert_ulong4(short4);
ulong4 __ovld __cnfn convert_ulong4_sat(short4);
ulong4 __ovld __cnfn convert_ulong4_rte(ushort4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(ushort4);
ulong4 __ovld __cnfn convert_ulong4_rtz(ushort4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(ushort4);
ulong4 __ovld __cnfn convert_ulong4_rtp(ushort4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(ushort4);
ulong4 __ovld __cnfn convert_ulong4_rtn(ushort4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(ushort4);
ulong4 __ovld __cnfn convert_ulong4(ushort4);
ulong4 __ovld __cnfn convert_ulong4_sat(ushort4);
ulong4 __ovld __cnfn convert_ulong4_rte(int4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(int4);
ulong4 __ovld __cnfn convert_ulong4_rtz(int4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(int4);
ulong4 __ovld __cnfn convert_ulong4_rtp(int4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(int4);
ulong4 __ovld __cnfn convert_ulong4_rtn(int4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(int4);
ulong4 __ovld __cnfn convert_ulong4(int4);
ulong4 __ovld __cnfn convert_ulong4_sat(int4);
ulong4 __ovld __cnfn convert_ulong4_rte(uint4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(uint4);
ulong4 __ovld __cnfn convert_ulong4_rtz(uint4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(uint4);
ulong4 __ovld __cnfn convert_ulong4_rtp(uint4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(uint4);
ulong4 __ovld __cnfn convert_ulong4_rtn(uint4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(uint4);
ulong4 __ovld __cnfn convert_ulong4(uint4);
ulong4 __ovld __cnfn convert_ulong4_sat(uint4);
ulong4 __ovld __cnfn convert_ulong4_rte(long4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(long4);
ulong4 __ovld __cnfn convert_ulong4_rtz(long4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(long4);
ulong4 __ovld __cnfn convert_ulong4_rtp(long4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(long4);
ulong4 __ovld __cnfn convert_ulong4_rtn(long4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(long4);
ulong4 __ovld __cnfn convert_ulong4(long4);
ulong4 __ovld __cnfn convert_ulong4_sat(long4);
ulong4 __ovld __cnfn convert_ulong4_rte(ulong4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(ulong4);
ulong4 __ovld __cnfn convert_ulong4_rtz(ulong4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(ulong4);
ulong4 __ovld __cnfn convert_ulong4_rtp(ulong4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(ulong4);
ulong4 __ovld __cnfn convert_ulong4_rtn(ulong4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(ulong4);
ulong4 __ovld __cnfn convert_ulong4(ulong4);
ulong4 __ovld __cnfn convert_ulong4_sat(ulong4);
ulong4 __ovld __cnfn convert_ulong4_rte(float4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(float4);
ulong4 __ovld __cnfn convert_ulong4_rtz(float4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(float4);
ulong4 __ovld __cnfn convert_ulong4_rtp(float4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(float4);
ulong4 __ovld __cnfn convert_ulong4_rtn(float4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(float4);
ulong4 __ovld __cnfn convert_ulong4(float4);
ulong4 __ovld __cnfn convert_ulong4_sat(float4);
float4 __ovld __cnfn convert_float4_rte(char4);
float4 __ovld __cnfn convert_float4_rtz(char4);
float4 __ovld __cnfn convert_float4_rtp(char4);
float4 __ovld __cnfn convert_float4_rtn(char4);
float4 __ovld __cnfn convert_float4(char4);
float4 __ovld __cnfn convert_float4_rte(uchar4);
float4 __ovld __cnfn convert_float4_rtz(uchar4);
float4 __ovld __cnfn convert_float4_rtp(uchar4);
float4 __ovld __cnfn convert_float4_rtn(uchar4);
float4 __ovld __cnfn convert_float4(uchar4);
float4 __ovld __cnfn convert_float4_rte(short4);
float4 __ovld __cnfn convert_float4_rtz(short4);
float4 __ovld __cnfn convert_float4_rtp(short4);
float4 __ovld __cnfn convert_float4_rtn(short4);
float4 __ovld __cnfn convert_float4(short4);
float4 __ovld __cnfn convert_float4_rte(ushort4);
float4 __ovld __cnfn convert_float4_rtz(ushort4);
float4 __ovld __cnfn convert_float4_rtp(ushort4);
float4 __ovld __cnfn convert_float4_rtn(ushort4);
float4 __ovld __cnfn convert_float4(ushort4);
float4 __ovld __cnfn convert_float4_rte(int4);
float4 __ovld __cnfn convert_float4_rtz(int4);
float4 __ovld __cnfn convert_float4_rtp(int4);
float4 __ovld __cnfn convert_float4_rtn(int4);
float4 __ovld __cnfn convert_float4(int4);
float4 __ovld __cnfn convert_float4_rte(uint4);
float4 __ovld __cnfn convert_float4_rtz(uint4);
float4 __ovld __cnfn convert_float4_rtp(uint4);
float4 __ovld __cnfn convert_float4_rtn(uint4);
float4 __ovld __cnfn convert_float4(uint4);
float4 __ovld __cnfn convert_float4_rte(long4);
float4 __ovld __cnfn convert_float4_rtz(long4);
float4 __ovld __cnfn convert_float4_rtp(long4);
float4 __ovld __cnfn convert_float4_rtn(long4);
float4 __ovld __cnfn convert_float4(long4);
float4 __ovld __cnfn convert_float4_rte(ulong4);
float4 __ovld __cnfn convert_float4_rtz(ulong4);
float4 __ovld __cnfn convert_float4_rtp(ulong4);
float4 __ovld __cnfn convert_float4_rtn(ulong4);
float4 __ovld __cnfn convert_float4(ulong4);
float4 __ovld __cnfn convert_float4_rte(float4);
float4 __ovld __cnfn convert_float4_rtz(float4);
float4 __ovld __cnfn convert_float4_rtp(float4);
float4 __ovld __cnfn convert_float4_rtn(float4);
float4 __ovld __cnfn convert_float4(float4);
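/*
 * Usage note (illustrative, not exhaustive): the explicit conversions
 * declared above follow the OpenCL C naming scheme
 * convert_<destType><N>[_sat][_<rounding>](srcType<N>).  The optional _sat
 * suffix saturates out-of-range values to the destination type's range,
 * and the rounding suffixes select round-to-nearest-even (_rte), toward
 * zero (_rtz), toward +infinity (_rtp) or toward -infinity (_rtn).
 * A minimal sketch of how a 4-element conversion is typically used; the
 * kernel name and buffer arguments here are hypothetical:
 *
 *   __kernel void demo_pack(__global const float4 *in, __global uchar4 *out) {
 *       size_t i = get_global_id(0);
 *       // Saturate to [0, 255] and round to nearest even before narrowing.
 *       out[i] = convert_uchar4_sat_rte(in[i]);
 *   }
 */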
char8 __ovld __cnfn convert_char8_rte(char8);
char8 __ovld __cnfn convert_char8_sat_rte(char8);
char8 __ovld __cnfn convert_char8_rtz(char8);
char8 __ovld __cnfn convert_char8_sat_rtz(char8);
char8 __ovld __cnfn convert_char8_rtp(char8);
char8 __ovld __cnfn convert_char8_sat_rtp(char8);
char8 __ovld __cnfn convert_char8_rtn(char8);
char8 __ovld __cnfn convert_char8_sat_rtn(char8);
char8 __ovld __cnfn convert_char8(char8);
char8 __ovld __cnfn convert_char8_sat(char8);
char8 __ovld __cnfn convert_char8_rte(uchar8);
char8 __ovld __cnfn convert_char8_sat_rte(uchar8);
char8 __ovld __cnfn convert_char8_rtz(uchar8);
char8 __ovld __cnfn convert_char8_sat_rtz(uchar8);
char8 __ovld __cnfn convert_char8_rtp(uchar8);
char8 __ovld __cnfn convert_char8_sat_rtp(uchar8);
char8 __ovld __cnfn convert_char8_rtn(uchar8);
char8 __ovld __cnfn convert_char8_sat_rtn(uchar8);
char8 __ovld __cnfn convert_char8(uchar8);
char8 __ovld __cnfn convert_char8_sat(uchar8);
char8 __ovld __cnfn convert_char8_rte(short8);
char8 __ovld __cnfn convert_char8_sat_rte(short8);
char8 __ovld __cnfn convert_char8_rtz(short8);
char8 __ovld __cnfn convert_char8_sat_rtz(short8);
char8 __ovld __cnfn convert_char8_rtp(short8);
char8 __ovld __cnfn convert_char8_sat_rtp(short8);
char8 __ovld __cnfn convert_char8_rtn(short8);
char8 __ovld __cnfn convert_char8_sat_rtn(short8);
char8 __ovld __cnfn convert_char8(short8);
char8 __ovld __cnfn convert_char8_sat(short8);
char8 __ovld __cnfn convert_char8_rte(ushort8);
char8 __ovld __cnfn convert_char8_sat_rte(ushort8);
char8 __ovld __cnfn convert_char8_rtz(ushort8);
char8 __ovld __cnfn convert_char8_sat_rtz(ushort8);
char8 __ovld __cnfn convert_char8_rtp(ushort8);
char8 __ovld __cnfn convert_char8_sat_rtp(ushort8);
char8 __ovld __cnfn convert_char8_rtn(ushort8);
char8 __ovld __cnfn convert_char8_sat_rtn(ushort8);
char8 __ovld __cnfn convert_char8(ushort8);
char8 __ovld __cnfn convert_char8_sat(ushort8);
char8 __ovld __cnfn convert_char8_rte(int8);
char8 __ovld __cnfn convert_char8_sat_rte(int8);
char8 __ovld __cnfn convert_char8_rtz(int8);
char8 __ovld __cnfn convert_char8_sat_rtz(int8);
char8 __ovld __cnfn convert_char8_rtp(int8);
char8 __ovld __cnfn convert_char8_sat_rtp(int8);
char8 __ovld __cnfn convert_char8_rtn(int8);
char8 __ovld __cnfn convert_char8_sat_rtn(int8);
char8 __ovld __cnfn convert_char8(int8);
char8 __ovld __cnfn convert_char8_sat(int8);
char8 __ovld __cnfn convert_char8_rte(uint8);
char8 __ovld __cnfn convert_char8_sat_rte(uint8);
char8 __ovld __cnfn convert_char8_rtz(uint8);
char8 __ovld __cnfn convert_char8_sat_rtz(uint8);
char8 __ovld __cnfn convert_char8_rtp(uint8);
char8 __ovld __cnfn convert_char8_sat_rtp(uint8);
char8 __ovld __cnfn convert_char8_rtn(uint8);
char8 __ovld __cnfn convert_char8_sat_rtn(uint8);
char8 __ovld __cnfn convert_char8(uint8);
char8 __ovld __cnfn convert_char8_sat(uint8);
char8 __ovld __cnfn convert_char8_rte(long8);
char8 __ovld __cnfn convert_char8_sat_rte(long8);
char8 __ovld __cnfn convert_char8_rtz(long8);
char8 __ovld __cnfn convert_char8_sat_rtz(long8);
char8 __ovld __cnfn convert_char8_rtp(long8);
char8 __ovld __cnfn convert_char8_sat_rtp(long8);
char8 __ovld __cnfn convert_char8_rtn(long8);
char8 __ovld __cnfn convert_char8_sat_rtn(long8);
char8 __ovld __cnfn convert_char8(long8);
char8 __ovld __cnfn convert_char8_sat(long8);
char8 __ovld __cnfn convert_char8_rte(ulong8);
char8 __ovld __cnfn convert_char8_sat_rte(ulong8);
char8 __ovld __cnfn convert_char8_rtz(ulong8);
char8 __ovld __cnfn convert_char8_sat_rtz(ulong8);
char8 __ovld __cnfn convert_char8_rtp(ulong8);
char8 __ovld __cnfn convert_char8_sat_rtp(ulong8);
char8 __ovld __cnfn convert_char8_rtn(ulong8);
char8 __ovld __cnfn convert_char8_sat_rtn(ulong8);
char8 __ovld __cnfn convert_char8(ulong8);
char8 __ovld __cnfn convert_char8_sat(ulong8);
char8 __ovld __cnfn convert_char8_rte(float8);
char8 __ovld __cnfn convert_char8_sat_rte(float8);
char8 __ovld __cnfn convert_char8_rtz(float8);
char8 __ovld __cnfn convert_char8_sat_rtz(float8);
char8 __ovld __cnfn convert_char8_rtp(float8);
char8 __ovld __cnfn convert_char8_sat_rtp(float8);
char8 __ovld __cnfn convert_char8_rtn(float8);
char8 __ovld __cnfn convert_char8_sat_rtn(float8);
char8 __ovld __cnfn convert_char8(float8);
char8 __ovld __cnfn convert_char8_sat(float8);
uchar8 __ovld __cnfn convert_uchar8_rte(char8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(char8);
uchar8 __ovld __cnfn convert_uchar8_rtz(char8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(char8);
uchar8 __ovld __cnfn convert_uchar8_rtp(char8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(char8);
uchar8 __ovld __cnfn convert_uchar8_rtn(char8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(char8);
uchar8 __ovld __cnfn convert_uchar8(char8);
uchar8 __ovld __cnfn convert_uchar8_sat(char8);
uchar8 __ovld __cnfn convert_uchar8_rte(uchar8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(uchar8);
uchar8 __ovld __cnfn convert_uchar8_rtz(uchar8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(uchar8);
uchar8 __ovld __cnfn convert_uchar8_rtp(uchar8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(uchar8);
uchar8 __ovld __cnfn convert_uchar8_rtn(uchar8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(uchar8);
uchar8 __ovld __cnfn convert_uchar8(uchar8);
uchar8 __ovld __cnfn convert_uchar8_sat(uchar8);
uchar8 __ovld __cnfn convert_uchar8_rte(short8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(short8);
uchar8 __ovld __cnfn convert_uchar8_rtz(short8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(short8);
uchar8 __ovld __cnfn convert_uchar8_rtp(short8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(short8);
uchar8 __ovld __cnfn convert_uchar8_rtn(short8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(short8);
uchar8 __ovld __cnfn convert_uchar8(short8);
uchar8 __ovld __cnfn convert_uchar8_sat(short8);
uchar8 __ovld __cnfn convert_uchar8_rte(ushort8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(ushort8);
uchar8 __ovld __cnfn convert_uchar8_rtz(ushort8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(ushort8);
uchar8 __ovld __cnfn convert_uchar8_rtp(ushort8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(ushort8);
uchar8 __ovld __cnfn convert_uchar8_rtn(ushort8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(ushort8);
uchar8 __ovld __cnfn convert_uchar8(ushort8);
uchar8 __ovld __cnfn convert_uchar8_sat(ushort8);
uchar8 __ovld __cnfn convert_uchar8_rte(int8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(int8);
uchar8 __ovld __cnfn convert_uchar8_rtz(int8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(int8);
uchar8 __ovld __cnfn convert_uchar8_rtp(int8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(int8);
uchar8 __ovld __cnfn convert_uchar8_rtn(int8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(int8);
uchar8 __ovld __cnfn convert_uchar8(int8);
uchar8 __ovld __cnfn convert_uchar8_sat(int8);
uchar8 __ovld __cnfn convert_uchar8_rte(uint8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(uint8);
uchar8 __ovld __cnfn convert_uchar8_rtz(uint8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(uint8);
uchar8 __ovld __cnfn convert_uchar8_rtp(uint8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(uint8);
uchar8 __ovld __cnfn convert_uchar8_rtn(uint8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(uint8);
uchar8 __ovld __cnfn convert_uchar8(uint8);
uchar8 __ovld __cnfn convert_uchar8_sat(uint8);
uchar8 __ovld __cnfn convert_uchar8_rte(long8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(long8);
uchar8 __ovld __cnfn convert_uchar8_rtz(long8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(long8);
uchar8 __ovld __cnfn convert_uchar8_rtp(long8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(long8);
uchar8 __ovld __cnfn convert_uchar8_rtn(long8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(long8);
uchar8 __ovld __cnfn convert_uchar8(long8);
uchar8 __ovld __cnfn convert_uchar8_sat(long8);
uchar8 __ovld __cnfn convert_uchar8_rte(ulong8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(ulong8);
uchar8 __ovld __cnfn convert_uchar8_rtz(ulong8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(ulong8);
uchar8 __ovld __cnfn convert_uchar8_rtp(ulong8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(ulong8);
uchar8 __ovld __cnfn convert_uchar8_rtn(ulong8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(ulong8);
uchar8 __ovld __cnfn convert_uchar8(ulong8);
uchar8 __ovld __cnfn convert_uchar8_sat(ulong8);
uchar8 __ovld __cnfn convert_uchar8_rte(float8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(float8);
uchar8 __ovld __cnfn convert_uchar8_rtz(float8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(float8);
uchar8 __ovld __cnfn convert_uchar8_rtp(float8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(float8);
uchar8 __ovld __cnfn convert_uchar8_rtn(float8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(float8);
uchar8 __ovld __cnfn convert_uchar8(float8);
uchar8 __ovld __cnfn convert_uchar8_sat(float8);
short8 __ovld __cnfn convert_short8_rte(char8);
short8 __ovld __cnfn convert_short8_sat_rte(char8);
short8 __ovld __cnfn convert_short8_rtz(char8);
short8 __ovld __cnfn convert_short8_sat_rtz(char8);
short8 __ovld __cnfn convert_short8_rtp(char8);
short8 __ovld __cnfn convert_short8_sat_rtp(char8);
short8 __ovld __cnfn convert_short8_rtn(char8);
short8 __ovld __cnfn convert_short8_sat_rtn(char8);
short8 __ovld __cnfn convert_short8(char8);
short8 __ovld __cnfn convert_short8_sat(char8);
short8 __ovld __cnfn convert_short8_rte(uchar8);
short8 __ovld __cnfn convert_short8_sat_rte(uchar8);
short8 __ovld __cnfn convert_short8_rtz(uchar8);
short8 __ovld __cnfn convert_short8_sat_rtz(uchar8);
short8 __ovld __cnfn convert_short8_rtp(uchar8);
short8 __ovld __cnfn convert_short8_sat_rtp(uchar8);
short8 __ovld __cnfn convert_short8_rtn(uchar8);
short8 __ovld __cnfn convert_short8_sat_rtn(uchar8);
short8 __ovld __cnfn convert_short8(uchar8);
short8 __ovld __cnfn convert_short8_sat(uchar8);
short8 __ovld __cnfn convert_short8_rte(short8);
short8 __ovld __cnfn convert_short8_sat_rte(short8);
short8 __ovld __cnfn convert_short8_rtz(short8);
short8 __ovld __cnfn convert_short8_sat_rtz(short8);
short8 __ovld __cnfn convert_short8_rtp(short8);
short8 __ovld __cnfn convert_short8_sat_rtp(short8);
short8 __ovld __cnfn convert_short8_rtn(short8);
short8 __ovld __cnfn convert_short8_sat_rtn(short8);
short8 __ovld __cnfn convert_short8(short8);
short8 __ovld __cnfn convert_short8_sat(short8);
short8 __ovld __cnfn convert_short8_rte(ushort8);
short8 __ovld __cnfn convert_short8_sat_rte(ushort8);
short8 __ovld __cnfn convert_short8_rtz(ushort8);
short8 __ovld __cnfn convert_short8_sat_rtz(ushort8);
short8 __ovld __cnfn convert_short8_rtp(ushort8);
short8 __ovld __cnfn convert_short8_sat_rtp(ushort8);
short8 __ovld __cnfn convert_short8_rtn(ushort8);
short8 __ovld __cnfn convert_short8_sat_rtn(ushort8);
short8 __ovld __cnfn convert_short8(ushort8);
short8 __ovld __cnfn convert_short8_sat(ushort8);
short8 __ovld __cnfn convert_short8_rte(int8);
short8 __ovld __cnfn convert_short8_sat_rte(int8);
short8 __ovld __cnfn convert_short8_rtz(int8);
short8 __ovld __cnfn convert_short8_sat_rtz(int8);
short8 __ovld __cnfn convert_short8_rtp(int8);
short8 __ovld __cnfn convert_short8_sat_rtp(int8);
short8 __ovld __cnfn convert_short8_rtn(int8);
short8 __ovld __cnfn convert_short8_sat_rtn(int8);
short8 __ovld __cnfn convert_short8(int8);
short8 __ovld __cnfn convert_short8_sat(int8);
short8 __ovld __cnfn convert_short8_rte(uint8);
short8 __ovld __cnfn convert_short8_sat_rte(uint8);
short8 __ovld __cnfn convert_short8_rtz(uint8);
short8 __ovld __cnfn convert_short8_sat_rtz(uint8);
short8 __ovld __cnfn convert_short8_rtp(uint8);
short8 __ovld __cnfn convert_short8_sat_rtp(uint8);
short8 __ovld __cnfn convert_short8_rtn(uint8);
short8 __ovld __cnfn convert_short8_sat_rtn(uint8);
short8 __ovld __cnfn convert_short8(uint8);
short8 __ovld __cnfn convert_short8_sat(uint8);
short8 __ovld __cnfn convert_short8_rte(long8);
short8 __ovld __cnfn convert_short8_sat_rte(long8);
short8 __ovld __cnfn convert_short8_rtz(long8);
short8 __ovld __cnfn convert_short8_sat_rtz(long8);
short8 __ovld __cnfn convert_short8_rtp(long8);
short8 __ovld __cnfn convert_short8_sat_rtp(long8);
short8 __ovld __cnfn convert_short8_rtn(long8);
short8 __ovld __cnfn convert_short8_sat_rtn(long8);
short8 __ovld __cnfn convert_short8(long8);
short8 __ovld __cnfn convert_short8_sat(long8);
short8 __ovld __cnfn convert_short8_rte(ulong8);
short8 __ovld __cnfn convert_short8_sat_rte(ulong8);
short8 __ovld __cnfn convert_short8_rtz(ulong8);
short8 __ovld __cnfn convert_short8_sat_rtz(ulong8);
short8 __ovld __cnfn convert_short8_rtp(ulong8);
short8 __ovld __cnfn convert_short8_sat_rtp(ulong8);
short8 __ovld __cnfn convert_short8_rtn(ulong8);
short8 __ovld __cnfn convert_short8_sat_rtn(ulong8);
short8 __ovld __cnfn convert_short8(ulong8);
short8 __ovld __cnfn convert_short8_sat(ulong8);
short8 __ovld __cnfn convert_short8_rte(float8);
short8 __ovld __cnfn convert_short8_sat_rte(float8);
short8 __ovld __cnfn convert_short8_rtz(float8);
short8 __ovld __cnfn convert_short8_sat_rtz(float8);
short8 __ovld __cnfn convert_short8_rtp(float8);
short8 __ovld __cnfn convert_short8_sat_rtp(float8);
short8 __ovld __cnfn convert_short8_rtn(float8);
short8 __ovld __cnfn convert_short8_sat_rtn(float8);
short8 __ovld __cnfn convert_short8(float8);
short8 __ovld __cnfn convert_short8_sat(float8);
ushort8 __ovld __cnfn convert_ushort8_rte(char8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(char8);
ushort8 __ovld __cnfn convert_ushort8_rtz(char8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(char8);
ushort8 __ovld __cnfn convert_ushort8_rtp(char8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(char8);
ushort8 __ovld __cnfn convert_ushort8_rtn(char8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(char8);
ushort8 __ovld __cnfn convert_ushort8(char8);
ushort8 __ovld __cnfn convert_ushort8_sat(char8);
ushort8 __ovld __cnfn convert_ushort8_rte(uchar8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(uchar8);
ushort8 __ovld __cnfn convert_ushort8_rtz(uchar8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(uchar8);
ushort8 __ovld __cnfn convert_ushort8_rtp(uchar8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(uchar8);
ushort8 __ovld __cnfn convert_ushort8_rtn(uchar8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(uchar8);
ushort8 __ovld __cnfn convert_ushort8(uchar8);
ushort8 __ovld __cnfn convert_ushort8_sat(uchar8);
ushort8 __ovld __cnfn convert_ushort8_rte(short8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(short8);
ushort8 __ovld __cnfn convert_ushort8_rtz(short8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(short8);
ushort8 __ovld __cnfn convert_ushort8_rtp(short8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(short8);
ushort8 __ovld __cnfn convert_ushort8_rtn(short8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(short8);
ushort8 __ovld __cnfn convert_ushort8(short8);
ushort8 __ovld __cnfn convert_ushort8_sat(short8);
ushort8 __ovld __cnfn convert_ushort8_rte(ushort8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(ushort8);
ushort8 __ovld __cnfn convert_ushort8_rtz(ushort8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(ushort8);
ushort8 __ovld __cnfn convert_ushort8_rtp(ushort8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(ushort8);
ushort8 __ovld __cnfn convert_ushort8_rtn(ushort8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(ushort8);
ushort8 __ovld __cnfn convert_ushort8(ushort8);
ushort8 __ovld __cnfn convert_ushort8_sat(ushort8);
ushort8 __ovld __cnfn convert_ushort8_rte(int8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(int8);
ushort8 __ovld __cnfn convert_ushort8_rtz(int8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(int8);
ushort8 __ovld __cnfn convert_ushort8_rtp(int8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(int8);
ushort8 __ovld __cnfn convert_ushort8_rtn(int8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(int8);
ushort8 __ovld __cnfn convert_ushort8(int8);
ushort8 __ovld __cnfn convert_ushort8_sat(int8);
ushort8 __ovld __cnfn convert_ushort8_rte(uint8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(uint8);
ushort8 __ovld __cnfn convert_ushort8_rtz(uint8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(uint8);
ushort8 __ovld __cnfn convert_ushort8_rtp(uint8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(uint8);
ushort8 __ovld __cnfn convert_ushort8_rtn(uint8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(uint8);
ushort8 __ovld __cnfn convert_ushort8(uint8);
ushort8 __ovld __cnfn convert_ushort8_sat(uint8);
ushort8 __ovld __cnfn convert_ushort8_rte(long8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(long8);
ushort8 __ovld __cnfn convert_ushort8_rtz(long8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(long8);
ushort8 __ovld __cnfn convert_ushort8_rtp(long8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(long8);
ushort8 __ovld __cnfn convert_ushort8_rtn(long8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(long8);
ushort8 __ovld __cnfn convert_ushort8(long8);
ushort8 __ovld __cnfn convert_ushort8_sat(long8);
ushort8 __ovld __cnfn convert_ushort8_rte(ulong8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(ulong8);
ushort8 __ovld __cnfn convert_ushort8_rtz(ulong8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(ulong8);
ushort8 __ovld __cnfn convert_ushort8_rtp(ulong8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(ulong8);
ushort8 __ovld __cnfn convert_ushort8_rtn(ulong8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(ulong8);
ushort8 __ovld __cnfn convert_ushort8(ulong8);
ushort8 __ovld __cnfn convert_ushort8_sat(ulong8);
ushort8 __ovld __cnfn convert_ushort8_rte(float8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(float8);
ushort8 __ovld __cnfn convert_ushort8_rtz(float8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(float8);
ushort8 __ovld __cnfn convert_ushort8_rtp(float8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(float8);
ushort8 __ovld __cnfn convert_ushort8_rtn(float8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(float8);
ushort8 __ovld __cnfn convert_ushort8(float8);
ushort8 __ovld __cnfn convert_ushort8_sat(float8);
int8 __ovld __cnfn convert_int8_rte(char8);
int8 __ovld __cnfn convert_int8_sat_rte(char8);
int8 __ovld __cnfn convert_int8_rtz(char8);
int8 __ovld __cnfn convert_int8_sat_rtz(char8);
int8 __ovld __cnfn convert_int8_rtp(char8);
int8 __ovld __cnfn convert_int8_sat_rtp(char8);
int8 __ovld __cnfn convert_int8_rtn(char8);
int8 __ovld __cnfn convert_int8_sat_rtn(char8);
int8 __ovld __cnfn convert_int8(char8);
int8 __ovld __cnfn convert_int8_sat(char8);
int8 __ovld __cnfn convert_int8_rte(uchar8);
int8 __ovld __cnfn convert_int8_sat_rte(uchar8);
int8 __ovld __cnfn convert_int8_rtz(uchar8);
int8 __ovld __cnfn convert_int8_sat_rtz(uchar8);
int8 __ovld __cnfn convert_int8_rtp(uchar8);
int8 __ovld __cnfn convert_int8_sat_rtp(uchar8);
int8 __ovld __cnfn convert_int8_rtn(uchar8);
int8 __ovld __cnfn convert_int8_sat_rtn(uchar8);
int8 __ovld __cnfn convert_int8(uchar8);
int8 __ovld __cnfn convert_int8_sat(uchar8);
int8 __ovld __cnfn convert_int8_rte(short8);
int8 __ovld __cnfn convert_int8_sat_rte(short8);
int8 __ovld __cnfn convert_int8_rtz(short8);
int8 __ovld __cnfn convert_int8_sat_rtz(short8);
int8 __ovld __cnfn convert_int8_rtp(short8);
int8 __ovld __cnfn convert_int8_sat_rtp(short8);
int8 __ovld __cnfn convert_int8_rtn(short8);
int8 __ovld __cnfn convert_int8_sat_rtn(short8);
int8 __ovld __cnfn convert_int8(short8);
int8 __ovld __cnfn convert_int8_sat(short8);
int8 __ovld __cnfn convert_int8_rte(ushort8);
int8 __ovld __cnfn convert_int8_sat_rte(ushort8);
int8 __ovld __cnfn convert_int8_rtz(ushort8);
int8 __ovld __cnfn convert_int8_sat_rtz(ushort8);
int8 __ovld __cnfn convert_int8_rtp(ushort8);
int8 __ovld __cnfn convert_int8_sat_rtp(ushort8);
int8 __ovld __cnfn convert_int8_rtn(ushort8);
int8 __ovld __cnfn convert_int8_sat_rtn(ushort8);
int8 __ovld __cnfn convert_int8(ushort8);
int8 __ovld __cnfn convert_int8_sat(ushort8);
int8 __ovld __cnfn convert_int8_rte(int8);
int8 __ovld __cnfn convert_int8_sat_rte(int8);
int8 __ovld __cnfn convert_int8_rtz(int8);
int8 __ovld __cnfn convert_int8_sat_rtz(int8);
int8 __ovld __cnfn convert_int8_rtp(int8);
int8 __ovld __cnfn convert_int8_sat_rtp(int8);
int8 __ovld __cnfn convert_int8_rtn(int8);
int8 __ovld __cnfn convert_int8_sat_rtn(int8);
int8 __ovld __cnfn convert_int8(int8);
int8 __ovld __cnfn convert_int8_sat(int8);
int8 __ovld __cnfn convert_int8_rte(uint8);
int8 __ovld __cnfn convert_int8_sat_rte(uint8);
int8 __ovld __cnfn convert_int8_rtz(uint8);
int8 __ovld __cnfn convert_int8_sat_rtz(uint8);
int8 __ovld __cnfn convert_int8_rtp(uint8);
int8 __ovld __cnfn convert_int8_sat_rtp(uint8);
int8 __ovld __cnfn convert_int8_rtn(uint8);
int8 __ovld __cnfn convert_int8_sat_rtn(uint8);
int8 __ovld __cnfn convert_int8(uint8);
int8 __ovld __cnfn convert_int8_sat(uint8);
int8 __ovld __cnfn convert_int8_rte(long8);
int8 __ovld __cnfn convert_int8_sat_rte(long8);
int8 __ovld __cnfn convert_int8_rtz(long8);
int8 __ovld __cnfn convert_int8_sat_rtz(long8);
int8 __ovld __cnfn convert_int8_rtp(long8);
int8 __ovld __cnfn convert_int8_sat_rtp(long8);
int8 __ovld __cnfn convert_int8_rtn(long8);
int8 __ovld __cnfn convert_int8_sat_rtn(long8);
int8 __ovld __cnfn convert_int8(long8);
int8 __ovld __cnfn convert_int8_sat(long8);
int8 __ovld __cnfn convert_int8_rte(ulong8);
int8 __ovld __cnfn convert_int8_sat_rte(ulong8);
int8 __ovld __cnfn convert_int8_rtz(ulong8);
int8 __ovld __cnfn convert_int8_sat_rtz(ulong8);
int8 __ovld __cnfn convert_int8_rtp(ulong8);
int8 __ovld __cnfn convert_int8_sat_rtp(ulong8);
int8 __ovld __cnfn convert_int8_rtn(ulong8);
int8 __ovld __cnfn convert_int8_sat_rtn(ulong8);
int8 __ovld __cnfn convert_int8(ulong8);
int8 __ovld __cnfn convert_int8_sat(ulong8);
int8 __ovld __cnfn convert_int8_rte(float8);
int8 __ovld __cnfn convert_int8_sat_rte(float8);
int8 __ovld __cnfn convert_int8_rtz(float8);
int8 __ovld __cnfn convert_int8_sat_rtz(float8);
int8 __ovld __cnfn convert_int8_rtp(float8);
int8 __ovld __cnfn convert_int8_sat_rtp(float8);
int8 __ovld __cnfn convert_int8_rtn(float8);
int8 __ovld __cnfn convert_int8_sat_rtn(float8);
int8 __ovld __cnfn convert_int8(float8);
int8 __ovld __cnfn convert_int8_sat(float8);
uint8 __ovld __cnfn convert_uint8_rte(char8);
uint8 __ovld __cnfn convert_uint8_sat_rte(char8);
uint8 __ovld __cnfn convert_uint8_rtz(char8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(char8);
uint8 __ovld __cnfn convert_uint8_rtp(char8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(char8);
uint8 __ovld __cnfn convert_uint8_rtn(char8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(char8);
uint8 __ovld __cnfn convert_uint8(char8);
uint8 __ovld __cnfn convert_uint8_sat(char8);
uint8 __ovld __cnfn convert_uint8_rte(uchar8);
uint8 __ovld __cnfn convert_uint8_sat_rte(uchar8);
uint8 __ovld __cnfn convert_uint8_rtz(uchar8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(uchar8);
uint8 __ovld __cnfn convert_uint8_rtp(uchar8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(uchar8);
uint8 __ovld __cnfn convert_uint8_rtn(uchar8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(uchar8);
uint8 __ovld __cnfn convert_uint8(uchar8);
uint8 __ovld __cnfn convert_uint8_sat(uchar8);
uint8 __ovld __cnfn convert_uint8_rte(short8);
uint8 __ovld __cnfn convert_uint8_sat_rte(short8);
uint8 __ovld __cnfn convert_uint8_rtz(short8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(short8);
uint8 __ovld __cnfn convert_uint8_rtp(short8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(short8);
uint8 __ovld __cnfn convert_uint8_rtn(short8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(short8);
uint8 __ovld __cnfn convert_uint8(short8);
uint8 __ovld __cnfn convert_uint8_sat(short8);
uint8 __ovld __cnfn convert_uint8_rte(ushort8);
uint8 __ovld __cnfn convert_uint8_sat_rte(ushort8);
uint8 __ovld __cnfn convert_uint8_rtz(ushort8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(ushort8);
uint8 __ovld __cnfn convert_uint8_rtp(ushort8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(ushort8);
uint8 __ovld __cnfn convert_uint8_rtn(ushort8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(ushort8);
uint8 __ovld __cnfn convert_uint8(ushort8);
uint8 __ovld __cnfn convert_uint8_sat(ushort8);
uint8 __ovld __cnfn convert_uint8_rte(int8);
uint8 __ovld __cnfn convert_uint8_sat_rte(int8);
uint8 __ovld __cnfn convert_uint8_rtz(int8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(int8);
uint8 __ovld __cnfn convert_uint8_rtp(int8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(int8);
uint8 __ovld __cnfn convert_uint8_rtn(int8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(int8);
uint8 __ovld __cnfn convert_uint8(int8);
uint8 __ovld __cnfn convert_uint8_sat(int8);
uint8 __ovld __cnfn convert_uint8_rte(uint8);
uint8 __ovld __cnfn convert_uint8_sat_rte(uint8);
uint8 __ovld __cnfn convert_uint8_rtz(uint8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(uint8);
uint8 __ovld __cnfn convert_uint8_rtp(uint8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(uint8);
uint8 __ovld __cnfn convert_uint8_rtn(uint8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(uint8);
uint8 __ovld __cnfn convert_uint8(uint8);
uint8 __ovld __cnfn convert_uint8_sat(uint8);
uint8 __ovld __cnfn convert_uint8_rte(long8);
uint8 __ovld __cnfn convert_uint8_sat_rte(long8);
uint8 __ovld __cnfn convert_uint8_rtz(long8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(long8);
uint8 __ovld __cnfn convert_uint8_rtp(long8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(long8);
uint8 __ovld __cnfn convert_uint8_rtn(long8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(long8);
uint8 __ovld __cnfn convert_uint8(long8);
uint8 __ovld __cnfn convert_uint8_sat(long8);
uint8 __ovld __cnfn convert_uint8_rte(ulong8);
uint8 __ovld __cnfn convert_uint8_sat_rte(ulong8);
uint8 __ovld __cnfn convert_uint8_rtz(ulong8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(ulong8);
uint8 __ovld __cnfn convert_uint8_rtp(ulong8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(ulong8);
uint8 __ovld __cnfn convert_uint8_rtn(ulong8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(ulong8);
uint8 __ovld __cnfn convert_uint8(ulong8);
uint8 __ovld __cnfn convert_uint8_sat(ulong8);
uint8 __ovld __cnfn convert_uint8_rte(float8);
uint8 __ovld __cnfn convert_uint8_sat_rte(float8);
uint8 __ovld __cnfn convert_uint8_rtz(float8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(float8);
uint8 __ovld __cnfn convert_uint8_rtp(float8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(float8);
uint8 __ovld __cnfn convert_uint8_rtn(float8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(float8);
uint8 __ovld __cnfn convert_uint8(float8);
uint8 __ovld __cnfn convert_uint8_sat(float8);
long8 __ovld __cnfn convert_long8_rte(char8);
long8 __ovld __cnfn convert_long8_sat_rte(char8);
long8 __ovld __cnfn convert_long8_rtz(char8);
long8 __ovld __cnfn convert_long8_sat_rtz(char8);
long8 __ovld __cnfn convert_long8_rtp(char8);
long8 __ovld __cnfn convert_long8_sat_rtp(char8);
long8 __ovld __cnfn convert_long8_rtn(char8);
long8 __ovld __cnfn convert_long8_sat_rtn(char8);
long8 __ovld __cnfn convert_long8(char8);
long8 __ovld __cnfn convert_long8_sat(char8);
long8 __ovld __cnfn convert_long8_rte(uchar8);
long8 __ovld __cnfn convert_long8_sat_rte(uchar8);
long8 __ovld __cnfn convert_long8_rtz(uchar8);
long8 __ovld __cnfn convert_long8_sat_rtz(uchar8);
long8 __ovld __cnfn convert_long8_rtp(uchar8);
long8 __ovld __cnfn convert_long8_sat_rtp(uchar8);
long8 __ovld __cnfn convert_long8_rtn(uchar8);
long8 __ovld __cnfn convert_long8_sat_rtn(uchar8);
long8 __ovld __cnfn convert_long8(uchar8);
long8 __ovld __cnfn convert_long8_sat(uchar8);
long8 __ovld __cnfn convert_long8_rte(short8);
long8 __ovld __cnfn convert_long8_sat_rte(short8);
long8 __ovld __cnfn convert_long8_rtz(short8);
long8 __ovld __cnfn convert_long8_sat_rtz(short8);
long8 __ovld __cnfn convert_long8_rtp(short8);
long8 __ovld __cnfn convert_long8_sat_rtp(short8);
long8 __ovld __cnfn convert_long8_rtn(short8);
long8 __ovld __cnfn convert_long8_sat_rtn(short8);
long8 __ovld __cnfn convert_long8(short8);
long8 __ovld __cnfn convert_long8_sat(short8);
long8 __ovld __cnfn convert_long8_rte(ushort8);
long8 __ovld __cnfn convert_long8_sat_rte(ushort8);
long8 __ovld __cnfn convert_long8_rtz(ushort8);
long8 __ovld __cnfn convert_long8_sat_rtz(ushort8);
long8 __ovld __cnfn convert_long8_rtp(ushort8);
long8 __ovld __cnfn convert_long8_sat_rtp(ushort8);
long8 __ovld __cnfn convert_long8_rtn(ushort8);
long8 __ovld __cnfn convert_long8_sat_rtn(ushort8);
long8 __ovld __cnfn convert_long8(ushort8);
long8 __ovld __cnfn convert_long8_sat(ushort8);
long8 __ovld __cnfn convert_long8_rte(int8);
long8 __ovld __cnfn convert_long8_sat_rte(int8);
long8 __ovld __cnfn convert_long8_rtz(int8);
long8 __ovld __cnfn convert_long8_sat_rtz(int8);
long8 __ovld __cnfn convert_long8_rtp(int8);
long8 __ovld __cnfn convert_long8_sat_rtp(int8);
long8 __ovld __cnfn convert_long8_rtn(int8);
long8 __ovld __cnfn convert_long8_sat_rtn(int8);
long8 __ovld __cnfn convert_long8(int8);
long8 __ovld __cnfn convert_long8_sat(int8);
long8 __ovld __cnfn convert_long8_rte(uint8);
long8 __ovld __cnfn convert_long8_sat_rte(uint8);
long8 __ovld __cnfn convert_long8_rtz(uint8);
long8 __ovld __cnfn convert_long8_sat_rtz(uint8);
long8 __ovld __cnfn convert_long8_rtp(uint8);
long8 __ovld __cnfn convert_long8_sat_rtp(uint8);
long8 __ovld __cnfn convert_long8_rtn(uint8);
long8 __ovld __cnfn convert_long8_sat_rtn(uint8);
long8 __ovld __cnfn convert_long8(uint8);
long8 __ovld __cnfn convert_long8_sat(uint8);
long8 __ovld __cnfn convert_long8_rte(long8);
long8 __ovld __cnfn convert_long8_sat_rte(long8);
long8 __ovld __cnfn convert_long8_rtz(long8);
long8 __ovld __cnfn convert_long8_sat_rtz(long8);
long8 __ovld __cnfn convert_long8_rtp(long8);
long8 __ovld __cnfn convert_long8_sat_rtp(long8);
long8 __ovld __cnfn convert_long8_rtn(long8);
long8 __ovld __cnfn convert_long8_sat_rtn(long8);
long8 __ovld __cnfn convert_long8(long8);
long8 __ovld __cnfn convert_long8_sat(long8);
long8 __ovld __cnfn convert_long8_rte(ulong8);
long8 __ovld __cnfn convert_long8_sat_rte(ulong8);
long8 __ovld __cnfn convert_long8_rtz(ulong8);
long8 __ovld __cnfn convert_long8_sat_rtz(ulong8);
long8 __ovld __cnfn convert_long8_rtp(ulong8);
long8 __ovld __cnfn convert_long8_sat_rtp(ulong8);
long8 __ovld __cnfn convert_long8_rtn(ulong8);
long8 __ovld __cnfn convert_long8_sat_rtn(ulong8);
long8 __ovld __cnfn convert_long8(ulong8);
long8 __ovld __cnfn convert_long8_sat(ulong8);
long8 __ovld __cnfn convert_long8_rte(float8);
long8 __ovld __cnfn convert_long8_sat_rte(float8);
long8 __ovld __cnfn convert_long8_rtz(float8);
long8 __ovld __cnfn convert_long8_sat_rtz(float8);
long8 __ovld __cnfn convert_long8_rtp(float8);
long8 __ovld __cnfn convert_long8_sat_rtp(float8);
long8 __ovld __cnfn convert_long8_rtn(float8);
long8 __ovld __cnfn convert_long8_sat_rtn(float8);
long8 __ovld __cnfn convert_long8(float8);
long8 __ovld __cnfn convert_long8_sat(float8);
ulong8 __ovld __cnfn convert_ulong8_rte(char8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(char8);
ulong8 __ovld __cnfn convert_ulong8_rtz(char8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(char8);
ulong8 __ovld __cnfn convert_ulong8_rtp(char8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(char8);
ulong8 __ovld __cnfn convert_ulong8_rtn(char8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(char8);
ulong8 __ovld __cnfn convert_ulong8(char8);
ulong8 __ovld __cnfn convert_ulong8_sat(char8);
ulong8 __ovld __cnfn convert_ulong8_rte(uchar8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(uchar8);
ulong8 __ovld __cnfn convert_ulong8_rtz(uchar8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(uchar8);
ulong8 __ovld __cnfn convert_ulong8_rtp(uchar8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(uchar8);
ulong8 __ovld __cnfn convert_ulong8_rtn(uchar8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(uchar8);
ulong8 __ovld __cnfn convert_ulong8(uchar8);
ulong8 __ovld __cnfn convert_ulong8_sat(uchar8);
ulong8 __ovld __cnfn convert_ulong8_rte(short8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(short8);
ulong8 __ovld __cnfn convert_ulong8_rtz(short8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(short8);
ulong8 __ovld __cnfn convert_ulong8_rtp(short8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(short8);
ulong8 __ovld __cnfn convert_ulong8_rtn(short8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(short8);
ulong8 __ovld __cnfn convert_ulong8(short8);
ulong8 __ovld __cnfn convert_ulong8_sat(short8);
ulong8 __ovld __cnfn convert_ulong8_rte(ushort8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(ushort8);
ulong8 __ovld __cnfn convert_ulong8_rtz(ushort8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(ushort8);
ulong8 __ovld __cnfn convert_ulong8_rtp(ushort8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(ushort8);
ulong8 __ovld __cnfn convert_ulong8_rtn(ushort8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(ushort8);
ulong8 __ovld __cnfn convert_ulong8(ushort8);
ulong8 __ovld __cnfn convert_ulong8_sat(ushort8);
ulong8 __ovld __cnfn convert_ulong8_rte(int8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(int8);
ulong8 __ovld __cnfn convert_ulong8_rtz(int8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(int8);
ulong8 __ovld __cnfn convert_ulong8_rtp(int8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(int8);
ulong8 __ovld __cnfn convert_ulong8_rtn(int8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(int8);
ulong8 __ovld __cnfn convert_ulong8(int8);
ulong8 __ovld __cnfn convert_ulong8_sat(int8);
ulong8 __ovld __cnfn convert_ulong8_rte(uint8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(uint8);
ulong8 __ovld __cnfn convert_ulong8_rtz(uint8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(uint8);
ulong8 __ovld __cnfn convert_ulong8_rtp(uint8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(uint8);
ulong8 __ovld __cnfn convert_ulong8_rtn(uint8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(uint8);
ulong8 __ovld __cnfn convert_ulong8(uint8);
ulong8 __ovld __cnfn convert_ulong8_sat(uint8);
ulong8 __ovld __cnfn convert_ulong8_rte(long8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(long8);
ulong8 __ovld __cnfn convert_ulong8_rtz(long8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(long8);
ulong8 __ovld __cnfn convert_ulong8_rtp(long8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(long8);
ulong8 __ovld __cnfn convert_ulong8_rtn(long8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(long8);
ulong8 __ovld __cnfn convert_ulong8(long8);
ulong8 __ovld __cnfn convert_ulong8_sat(long8);
ulong8 __ovld __cnfn convert_ulong8_rte(ulong8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(ulong8);
ulong8 __ovld __cnfn convert_ulong8_rtz(ulong8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(ulong8);
ulong8 __ovld __cnfn convert_ulong8_rtp(ulong8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(ulong8);
ulong8 __ovld __cnfn convert_ulong8_rtn(ulong8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(ulong8);
ulong8 __ovld __cnfn convert_ulong8(ulong8);
ulong8 __ovld __cnfn convert_ulong8_sat(ulong8);
ulong8 __ovld __cnfn convert_ulong8_rte(float8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(float8);
ulong8 __ovld __cnfn convert_ulong8_rtz(float8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(float8);
ulong8 __ovld __cnfn convert_ulong8_rtp(float8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(float8);
ulong8 __ovld __cnfn convert_ulong8_rtn(float8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(float8);
ulong8 __ovld __cnfn convert_ulong8(float8);
ulong8 __ovld __cnfn convert_ulong8_sat(float8);
float8 __ovld __cnfn convert_float8_rte(char8);
float8 __ovld __cnfn convert_float8_rtz(char8);
float8 __ovld __cnfn convert_float8_rtp(char8);
float8 __ovld __cnfn convert_float8_rtn(char8);
float8 __ovld __cnfn convert_float8(char8);
float8 __ovld __cnfn convert_float8_rte(uchar8);
float8 __ovld __cnfn convert_float8_rtz(uchar8);
float8 __ovld __cnfn convert_float8_rtp(uchar8);
float8 __ovld __cnfn convert_float8_rtn(uchar8);
float8 __ovld __cnfn convert_float8(uchar8);
float8 __ovld __cnfn convert_float8_rte(short8);
float8 __ovld __cnfn convert_float8_rtz(short8);
float8 __ovld __cnfn convert_float8_rtp(short8);
float8 __ovld __cnfn convert_float8_rtn(short8);
float8 __ovld __cnfn convert_float8(short8);
float8 __ovld __cnfn convert_float8_rte(ushort8);
float8 __ovld __cnfn convert_float8_rtz(ushort8);
float8 __ovld __cnfn convert_float8_rtp(ushort8);
float8 __ovld __cnfn convert_float8_rtn(ushort8);
float8 __ovld __cnfn convert_float8(ushort8);
float8 __ovld __cnfn convert_float8_rte(int8);
float8 __ovld __cnfn convert_float8_rtz(int8);
float8 __ovld __cnfn convert_float8_rtp(int8);
float8 __ovld __cnfn convert_float8_rtn(int8);
float8 __ovld __cnfn convert_float8(int8);
float8 __ovld __cnfn convert_float8_rte(uint8);
float8 __ovld __cnfn convert_float8_rtz(uint8);
float8 __ovld __cnfn convert_float8_rtp(uint8);
float8 __ovld __cnfn convert_float8_rtn(uint8);
float8 __ovld __cnfn convert_float8(uint8);
float8 __ovld __cnfn convert_float8_rte(long8);
float8 __ovld __cnfn convert_float8_rtz(long8);
float8 __ovld __cnfn convert_float8_rtp(long8);
float8 __ovld __cnfn convert_float8_rtn(long8);
float8 __ovld __cnfn convert_float8(long8);
float8 __ovld __cnfn convert_float8_rte(ulong8);
float8 __ovld __cnfn convert_float8_rtz(ulong8);
float8 __ovld __cnfn convert_float8_rtp(ulong8);
float8 __ovld __cnfn convert_float8_rtn(ulong8);
float8 __ovld __cnfn convert_float8(ulong8);
float8 __ovld __cnfn convert_float8_rte(float8);
float8 __ovld __cnfn convert_float8_rtz(float8);
float8 __ovld __cnfn convert_float8_rtp(float8);
float8 __ovld __cnfn convert_float8_rtn(float8);
float8 __ovld __cnfn convert_float8(float8);
char16 __ovld __cnfn convert_char16_rte(char16);
char16 __ovld __cnfn convert_char16_sat_rte(char16);
char16 __ovld __cnfn convert_char16_rtz(char16);
char16 __ovld __cnfn convert_char16_sat_rtz(char16);
char16 __ovld __cnfn convert_char16_rtp(char16);
char16 __ovld __cnfn convert_char16_sat_rtp(char16);
char16 __ovld __cnfn convert_char16_rtn(char16);
char16 __ovld __cnfn convert_char16_sat_rtn(char16);
char16 __ovld __cnfn convert_char16(char16);
char16 __ovld __cnfn convert_char16_sat(char16);
char16 __ovld __cnfn convert_char16_rte(uchar16);
char16 __ovld __cnfn convert_char16_sat_rte(uchar16);
char16 __ovld __cnfn convert_char16_rtz(uchar16);
char16 __ovld __cnfn convert_char16_sat_rtz(uchar16);
char16 __ovld __cnfn convert_char16_rtp(uchar16);
char16 __ovld __cnfn convert_char16_sat_rtp(uchar16);
char16 __ovld __cnfn convert_char16_rtn(uchar16);
char16 __ovld __cnfn convert_char16_sat_rtn(uchar16);
char16 __ovld __cnfn convert_char16(uchar16);
char16 __ovld __cnfn convert_char16_sat(uchar16);
char16 __ovld __cnfn convert_char16_rte(short16);
char16 __ovld __cnfn convert_char16_sat_rte(short16);
char16 __ovld __cnfn convert_char16_rtz(short16);
char16 __ovld __cnfn convert_char16_sat_rtz(short16);
char16 __ovld __cnfn convert_char16_rtp(short16);
char16 __ovld __cnfn convert_char16_sat_rtp(short16);
char16 __ovld __cnfn convert_char16_rtn(short16);
char16 __ovld __cnfn convert_char16_sat_rtn(short16);
char16 __ovld __cnfn convert_char16(short16);
char16 __ovld __cnfn convert_char16_sat(short16);
char16 __ovld __cnfn convert_char16_rte(ushort16);
char16 __ovld __cnfn convert_char16_sat_rte(ushort16);
char16 __ovld __cnfn convert_char16_rtz(ushort16);
char16 __ovld __cnfn convert_char16_sat_rtz(ushort16);
char16 __ovld __cnfn convert_char16_rtp(ushort16);
char16 __ovld __cnfn convert_char16_sat_rtp(ushort16);
char16 __ovld __cnfn convert_char16_rtn(ushort16);
char16 __ovld __cnfn convert_char16_sat_rtn(ushort16);
char16 __ovld __cnfn convert_char16(ushort16);
char16 __ovld __cnfn convert_char16_sat(ushort16);
char16 __ovld __cnfn convert_char16_rte(int16);
char16 __ovld __cnfn convert_char16_sat_rte(int16);
char16 __ovld __cnfn convert_char16_rtz(int16);
char16 __ovld __cnfn convert_char16_sat_rtz(int16);
char16 __ovld __cnfn convert_char16_rtp(int16);
char16 __ovld __cnfn convert_char16_sat_rtp(int16);
char16 __ovld __cnfn convert_char16_rtn(int16);
char16 __ovld __cnfn convert_char16_sat_rtn(int16);
char16 __ovld __cnfn convert_char16(int16);
char16 __ovld __cnfn convert_char16_sat(int16);
char16 __ovld __cnfn convert_char16_rte(uint16);
char16 __ovld __cnfn convert_char16_sat_rte(uint16);
char16 __ovld __cnfn convert_char16_rtz(uint16);
char16 __ovld __cnfn convert_char16_sat_rtz(uint16);
char16 __ovld __cnfn convert_char16_rtp(uint16);
char16 __ovld __cnfn convert_char16_sat_rtp(uint16);
char16 __ovld __cnfn convert_char16_rtn(uint16);
char16 __ovld __cnfn convert_char16_sat_rtn(uint16);
char16 __ovld __cnfn convert_char16(uint16);
char16 __ovld __cnfn convert_char16_sat(uint16);
char16 __ovld __cnfn convert_char16_rte(long16);
char16 __ovld __cnfn convert_char16_sat_rte(long16);
char16 __ovld __cnfn convert_char16_rtz(long16);
char16 __ovld __cnfn convert_char16_sat_rtz(long16);
char16 __ovld __cnfn convert_char16_rtp(long16);
char16 __ovld __cnfn convert_char16_sat_rtp(long16);
char16 __ovld __cnfn convert_char16_rtn(long16);
char16 __ovld __cnfn convert_char16_sat_rtn(long16);
char16 __ovld __cnfn convert_char16(long16);
char16 __ovld __cnfn convert_char16_sat(long16);
char16 __ovld __cnfn convert_char16_rte(ulong16);
char16 __ovld __cnfn convert_char16_sat_rte(ulong16);
char16 __ovld __cnfn convert_char16_rtz(ulong16);
char16 __ovld __cnfn convert_char16_sat_rtz(ulong16);
char16 __ovld __cnfn convert_char16_rtp(ulong16);
char16 __ovld __cnfn convert_char16_sat_rtp(ulong16);
char16 __ovld __cnfn convert_char16_rtn(ulong16);
char16 __ovld __cnfn convert_char16_sat_rtn(ulong16);
char16 __ovld __cnfn convert_char16(ulong16);
char16 __ovld __cnfn convert_char16_sat(ulong16);
char16 __ovld __cnfn convert_char16_rte(float16);
char16 __ovld __cnfn convert_char16_sat_rte(float16);
char16 __ovld __cnfn convert_char16_rtz(float16);
char16 __ovld __cnfn convert_char16_sat_rtz(float16);
char16 __ovld __cnfn convert_char16_rtp(float16);
char16 __ovld __cnfn convert_char16_sat_rtp(float16);
char16 __ovld __cnfn convert_char16_rtn(float16);
char16 __ovld __cnfn convert_char16_sat_rtn(float16);
char16 __ovld __cnfn convert_char16(float16);
char16 __ovld __cnfn convert_char16_sat(float16);
uchar16 __ovld __cnfn convert_uchar16_rte(char16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(char16);
uchar16 __ovld __cnfn convert_uchar16_rtz(char16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(char16);
uchar16 __ovld __cnfn convert_uchar16_rtp(char16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(char16);
uchar16 __ovld __cnfn convert_uchar16_rtn(char16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(char16);
uchar16 __ovld __cnfn convert_uchar16(char16);
uchar16 __ovld __cnfn convert_uchar16_sat(char16);
uchar16 __ovld __cnfn convert_uchar16_rte(uchar16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(uchar16);
uchar16 __ovld __cnfn convert_uchar16_rtz(uchar16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(uchar16);
uchar16 __ovld __cnfn convert_uchar16_rtp(uchar16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(uchar16);
uchar16 __ovld __cnfn convert_uchar16_rtn(uchar16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(uchar16);
uchar16 __ovld __cnfn convert_uchar16(uchar16);
uchar16 __ovld __cnfn convert_uchar16_sat(uchar16);
uchar16 __ovld __cnfn convert_uchar16_rte(short16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(short16);
uchar16 __ovld __cnfn convert_uchar16_rtz(short16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(short16);
uchar16 __ovld __cnfn convert_uchar16_rtp(short16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(short16);
uchar16 __ovld __cnfn convert_uchar16_rtn(short16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(short16);
uchar16 __ovld __cnfn convert_uchar16(short16);
uchar16 __ovld __cnfn convert_uchar16_sat(short16);
uchar16 __ovld __cnfn convert_uchar16_rte(ushort16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(ushort16);
uchar16 __ovld __cnfn convert_uchar16_rtz(ushort16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(ushort16);
uchar16 __ovld __cnfn convert_uchar16_rtp(ushort16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(ushort16);
uchar16 __ovld __cnfn convert_uchar16_rtn(ushort16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(ushort16);
uchar16 __ovld __cnfn convert_uchar16(ushort16);
uchar16 __ovld __cnfn convert_uchar16_sat(ushort16);
uchar16 __ovld __cnfn convert_uchar16_rte(int16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(int16);
uchar16 __ovld __cnfn convert_uchar16_rtz(int16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(int16);
uchar16 __ovld __cnfn convert_uchar16_rtp(int16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(int16);
uchar16 __ovld __cnfn convert_uchar16_rtn(int16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(int16);
uchar16 __ovld __cnfn convert_uchar16(int16);
uchar16 __ovld __cnfn convert_uchar16_sat(int16);
uchar16 __ovld __cnfn convert_uchar16_rte(uint16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(uint16);
uchar16 __ovld __cnfn convert_uchar16_rtz(uint16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(uint16);
uchar16 __ovld __cnfn convert_uchar16_rtp(uint16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(uint16);
uchar16 __ovld __cnfn convert_uchar16_rtn(uint16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(uint16);
uchar16 __ovld __cnfn convert_uchar16(uint16);
uchar16 __ovld __cnfn convert_uchar16_sat(uint16);
uchar16 __ovld __cnfn convert_uchar16_rte(long16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(long16);
uchar16 __ovld __cnfn convert_uchar16_rtz(long16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(long16);
uchar16 __ovld __cnfn convert_uchar16_rtp(long16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(long16);
uchar16 __ovld __cnfn convert_uchar16_rtn(long16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(long16);
uchar16 __ovld __cnfn convert_uchar16(long16);
uchar16 __ovld __cnfn convert_uchar16_sat(long16);
uchar16 __ovld __cnfn convert_uchar16_rte(ulong16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(ulong16);
uchar16 __ovld __cnfn convert_uchar16_rtz(ulong16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(ulong16);
uchar16 __ovld __cnfn convert_uchar16_rtp(ulong16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(ulong16);
uchar16 __ovld __cnfn convert_uchar16_rtn(ulong16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(ulong16);
uchar16 __ovld __cnfn convert_uchar16(ulong16);
uchar16 __ovld __cnfn convert_uchar16_sat(ulong16);
uchar16 __ovld __cnfn convert_uchar16_rte(float16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(float16);
uchar16 __ovld __cnfn convert_uchar16_rtz(float16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(float16);
uchar16 __ovld __cnfn convert_uchar16_rtp(float16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(float16);
uchar16 __ovld __cnfn convert_uchar16_rtn(float16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(float16);
uchar16 __ovld __cnfn convert_uchar16(float16);
uchar16 __ovld __cnfn convert_uchar16_sat(float16);
short16 __ovld __cnfn convert_short16_rte(char16);
short16 __ovld __cnfn convert_short16_sat_rte(char16);
short16 __ovld __cnfn convert_short16_rtz(char16);
short16 __ovld __cnfn convert_short16_sat_rtz(char16);
short16 __ovld __cnfn convert_short16_rtp(char16);
short16 __ovld __cnfn convert_short16_sat_rtp(char16);
short16 __ovld __cnfn convert_short16_rtn(char16);
short16 __ovld __cnfn convert_short16_sat_rtn(char16);
short16 __ovld __cnfn convert_short16(char16);
short16 __ovld __cnfn convert_short16_sat(char16);
short16 __ovld __cnfn convert_short16_rte(uchar16);
short16 __ovld __cnfn convert_short16_sat_rte(uchar16);
short16 __ovld __cnfn convert_short16_rtz(uchar16);
short16 __ovld __cnfn convert_short16_sat_rtz(uchar16);
short16 __ovld __cnfn convert_short16_rtp(uchar16);
short16 __ovld __cnfn convert_short16_sat_rtp(uchar16);
short16 __ovld __cnfn convert_short16_rtn(uchar16);
short16 __ovld __cnfn convert_short16_sat_rtn(uchar16);
short16 __ovld __cnfn convert_short16(uchar16);
short16 __ovld __cnfn convert_short16_sat(uchar16);
short16 __ovld __cnfn convert_short16_rte(short16);
short16 __ovld __cnfn convert_short16_sat_rte(short16);
short16 __ovld __cnfn convert_short16_rtz(short16);
short16 __ovld __cnfn convert_short16_sat_rtz(short16);
short16 __ovld __cnfn convert_short16_rtp(short16);
short16 __ovld __cnfn convert_short16_sat_rtp(short16);
short16 __ovld __cnfn convert_short16_rtn(short16);
short16 __ovld __cnfn convert_short16_sat_rtn(short16);
short16 __ovld __cnfn convert_short16(short16);
short16 __ovld __cnfn convert_short16_sat(short16);
short16 __ovld __cnfn convert_short16_rte(ushort16);
short16 __ovld __cnfn convert_short16_sat_rte(ushort16);
short16 __ovld __cnfn convert_short16_rtz(ushort16);
short16 __ovld __cnfn convert_short16_sat_rtz(ushort16);
short16 __ovld __cnfn convert_short16_rtp(ushort16);
short16 __ovld __cnfn convert_short16_sat_rtp(ushort16);
short16 __ovld __cnfn convert_short16_rtn(ushort16);
short16 __ovld __cnfn convert_short16_sat_rtn(ushort16);
short16 __ovld __cnfn convert_short16(ushort16);
short16 __ovld __cnfn convert_short16_sat(ushort16);
short16 __ovld __cnfn convert_short16_rte(int16);
short16 __ovld __cnfn convert_short16_sat_rte(int16);
short16 __ovld __cnfn convert_short16_rtz(int16);
short16 __ovld __cnfn convert_short16_sat_rtz(int16);
short16 __ovld __cnfn convert_short16_rtp(int16);
short16 __ovld __cnfn convert_short16_sat_rtp(int16);
short16 __ovld __cnfn convert_short16_rtn(int16);
short16 __ovld __cnfn convert_short16_sat_rtn(int16);
short16 __ovld __cnfn convert_short16(int16);
short16 __ovld __cnfn convert_short16_sat(int16);
short16 __ovld __cnfn convert_short16_rte(uint16);
short16 __ovld __cnfn convert_short16_sat_rte(uint16);
short16 __ovld __cnfn convert_short16_rtz(uint16);
short16 __ovld __cnfn convert_short16_sat_rtz(uint16);
short16 __ovld __cnfn convert_short16_rtp(uint16);
short16 __ovld __cnfn convert_short16_sat_rtp(uint16);
short16 __ovld __cnfn convert_short16_rtn(uint16);
short16 __ovld __cnfn convert_short16_sat_rtn(uint16);
short16 __ovld __cnfn convert_short16(uint16);
short16 __ovld __cnfn convert_short16_sat(uint16);
short16 __ovld __cnfn convert_short16_rte(long16);
short16 __ovld __cnfn convert_short16_sat_rte(long16);
short16 __ovld __cnfn convert_short16_rtz(long16);
short16 __ovld __cnfn convert_short16_sat_rtz(long16);
short16 __ovld __cnfn convert_short16_rtp(long16);
short16 __ovld __cnfn convert_short16_sat_rtp(long16);
short16 __ovld __cnfn convert_short16_rtn(long16);
short16 __ovld __cnfn convert_short16_sat_rtn(long16);
short16 __ovld __cnfn convert_short16(long16);
short16 __ovld __cnfn convert_short16_sat(long16);
short16 __ovld __cnfn convert_short16_rte(ulong16);
short16 __ovld __cnfn convert_short16_sat_rte(ulong16);
short16 __ovld __cnfn convert_short16_rtz(ulong16);
short16 __ovld __cnfn convert_short16_sat_rtz(ulong16);
short16 __ovld __cnfn convert_short16_rtp(ulong16);
short16 __ovld __cnfn convert_short16_sat_rtp(ulong16);
short16 __ovld __cnfn convert_short16_rtn(ulong16);
short16 __ovld __cnfn convert_short16_sat_rtn(ulong16);
short16 __ovld __cnfn convert_short16(ulong16);
short16 __ovld __cnfn convert_short16_sat(ulong16);
short16 __ovld __cnfn convert_short16_rte(float16);
short16 __ovld __cnfn convert_short16_sat_rte(float16);
short16 __ovld __cnfn convert_short16_rtz(float16);
short16 __ovld __cnfn convert_short16_sat_rtz(float16);
short16 __ovld __cnfn convert_short16_rtp(float16);
short16 __ovld __cnfn convert_short16_sat_rtp(float16);
short16 __ovld __cnfn convert_short16_rtn(float16);
short16 __ovld __cnfn convert_short16_sat_rtn(float16);
short16 __ovld __cnfn convert_short16(float16);
short16 __ovld __cnfn convert_short16_sat(float16);
ushort16 __ovld __cnfn convert_ushort16_rte(char16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(char16);
ushort16 __ovld __cnfn convert_ushort16_rtz(char16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(char16);
ushort16 __ovld __cnfn convert_ushort16_rtp(char16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(char16);
ushort16 __ovld __cnfn convert_ushort16_rtn(char16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(char16);
ushort16 __ovld __cnfn convert_ushort16(char16);
ushort16 __ovld __cnfn convert_ushort16_sat(char16);
ushort16 __ovld __cnfn convert_ushort16_rte(uchar16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(uchar16);
ushort16 __ovld __cnfn convert_ushort16_rtz(uchar16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(uchar16);
ushort16 __ovld __cnfn convert_ushort16_rtp(uchar16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(uchar16);
ushort16 __ovld __cnfn convert_ushort16_rtn(uchar16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(uchar16);
ushort16 __ovld __cnfn convert_ushort16(uchar16);
ushort16 __ovld __cnfn convert_ushort16_sat(uchar16);
ushort16 __ovld __cnfn convert_ushort16_rte(short16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(short16);
ushort16 __ovld __cnfn convert_ushort16_rtz(short16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(short16);
ushort16 __ovld __cnfn convert_ushort16_rtp(short16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(short16);
ushort16 __ovld __cnfn convert_ushort16_rtn(short16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(short16);
ushort16 __ovld __cnfn convert_ushort16(short16);
ushort16 __ovld __cnfn convert_ushort16_sat(short16);
ushort16 __ovld __cnfn convert_ushort16_rte(ushort16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(ushort16);
ushort16 __ovld __cnfn convert_ushort16_rtz(ushort16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(ushort16);
ushort16 __ovld __cnfn convert_ushort16_rtp(ushort16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(ushort16);
ushort16 __ovld __cnfn convert_ushort16_rtn(ushort16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(ushort16);
ushort16 __ovld __cnfn convert_ushort16(ushort16);
ushort16 __ovld __cnfn convert_ushort16_sat(ushort16);
ushort16 __ovld __cnfn convert_ushort16_rte(int16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(int16);
ushort16 __ovld __cnfn convert_ushort16_rtz(int16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(int16);
ushort16 __ovld __cnfn convert_ushort16_rtp(int16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(int16);
ushort16 __ovld __cnfn convert_ushort16_rtn(int16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(int16);
ushort16 __ovld __cnfn convert_ushort16(int16);
ushort16 __ovld __cnfn convert_ushort16_sat(int16);
ushort16 __ovld __cnfn convert_ushort16_rte(uint16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(uint16);
ushort16 __ovld __cnfn convert_ushort16_rtz(uint16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(uint16);
ushort16 __ovld __cnfn convert_ushort16_rtp(uint16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(uint16);
ushort16 __ovld __cnfn convert_ushort16_rtn(uint16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(uint16);
ushort16 __ovld __cnfn convert_ushort16(uint16);
ushort16 __ovld __cnfn convert_ushort16_sat(uint16);
ushort16 __ovld __cnfn convert_ushort16_rte(long16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(long16);
ushort16 __ovld __cnfn convert_ushort16_rtz(long16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(long16);
ushort16 __ovld __cnfn convert_ushort16_rtp(long16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(long16);
ushort16 __ovld __cnfn convert_ushort16_rtn(long16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(long16);
ushort16 __ovld __cnfn convert_ushort16(long16);
ushort16 __ovld __cnfn convert_ushort16_sat(long16);
ushort16 __ovld __cnfn convert_ushort16_rte(ulong16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(ulong16);
ushort16 __ovld __cnfn convert_ushort16_rtz(ulong16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(ulong16);
ushort16 __ovld __cnfn convert_ushort16_rtp(ulong16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(ulong16);
ushort16 __ovld __cnfn convert_ushort16_rtn(ulong16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(ulong16);
ushort16 __ovld __cnfn convert_ushort16(ulong16);
ushort16 __ovld __cnfn convert_ushort16_sat(ulong16);
ushort16 __ovld __cnfn convert_ushort16_rte(float16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(float16);
ushort16 __ovld __cnfn convert_ushort16_rtz(float16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(float16);
ushort16 __ovld __cnfn convert_ushort16_rtp(float16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(float16);
ushort16 __ovld __cnfn convert_ushort16_rtn(float16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(float16);
ushort16 __ovld __cnfn convert_ushort16(float16);
ushort16 __ovld __cnfn convert_ushort16_sat(float16);
int16 __ovld __cnfn convert_int16_rte(char16);
int16 __ovld __cnfn convert_int16_sat_rte(char16);
int16 __ovld __cnfn convert_int16_rtz(char16);
int16 __ovld __cnfn convert_int16_sat_rtz(char16);
int16 __ovld __cnfn convert_int16_rtp(char16);
int16 __ovld __cnfn convert_int16_sat_rtp(char16);
int16 __ovld __cnfn convert_int16_rtn(char16);
int16 __ovld __cnfn convert_int16_sat_rtn(char16);
int16 __ovld __cnfn convert_int16(char16);
int16 __ovld __cnfn convert_int16_sat(char16);
int16 __ovld __cnfn convert_int16_rte(uchar16);
int16 __ovld __cnfn convert_int16_sat_rte(uchar16);
int16 __ovld __cnfn convert_int16_rtz(uchar16);
int16 __ovld __cnfn convert_int16_sat_rtz(uchar16);
int16 __ovld __cnfn convert_int16_rtp(uchar16);
int16 __ovld __cnfn convert_int16_sat_rtp(uchar16);
int16 __ovld __cnfn convert_int16_rtn(uchar16);
int16 __ovld __cnfn convert_int16_sat_rtn(uchar16);
int16 __ovld __cnfn convert_int16(uchar16);
int16 __ovld __cnfn convert_int16_sat(uchar16);
int16 __ovld __cnfn convert_int16_rte(short16);
int16 __ovld __cnfn convert_int16_sat_rte(short16);
int16 __ovld __cnfn convert_int16_rtz(short16);
int16 __ovld __cnfn convert_int16_sat_rtz(short16);
int16 __ovld __cnfn convert_int16_rtp(short16);
int16 __ovld __cnfn convert_int16_sat_rtp(short16);
int16 __ovld __cnfn convert_int16_rtn(short16);
int16 __ovld __cnfn convert_int16_sat_rtn(short16);
int16 __ovld __cnfn convert_int16(short16);
int16 __ovld __cnfn convert_int16_sat(short16);
int16 __ovld __cnfn convert_int16_rte(ushort16);
int16 __ovld __cnfn convert_int16_sat_rte(ushort16);
int16 __ovld __cnfn convert_int16_rtz(ushort16);
int16 __ovld __cnfn convert_int16_sat_rtz(ushort16);
int16 __ovld __cnfn convert_int16_rtp(ushort16);
int16 __ovld __cnfn convert_int16_sat_rtp(ushort16);
int16 __ovld __cnfn convert_int16_rtn(ushort16);
int16 __ovld __cnfn convert_int16_sat_rtn(ushort16);
int16 __ovld __cnfn convert_int16(ushort16);
int16 __ovld __cnfn convert_int16_sat(ushort16);
int16 __ovld __cnfn convert_int16_rte(int16);
int16 __ovld __cnfn convert_int16_sat_rte(int16);
int16 __ovld __cnfn convert_int16_rtz(int16);
int16 __ovld __cnfn convert_int16_sat_rtz(int16);
int16 __ovld __cnfn convert_int16_rtp(int16);
int16 __ovld __cnfn convert_int16_sat_rtp(int16);
int16 __ovld __cnfn convert_int16_rtn(int16);
int16 __ovld __cnfn convert_int16_sat_rtn(int16);
int16 __ovld __cnfn convert_int16(int16);
int16 __ovld __cnfn convert_int16_sat(int16);
int16 __ovld __cnfn convert_int16_rte(uint16);
int16 __ovld __cnfn convert_int16_sat_rte(uint16);
int16 __ovld __cnfn convert_int16_rtz(uint16);
int16 __ovld __cnfn convert_int16_sat_rtz(uint16);
int16 __ovld __cnfn convert_int16_rtp(uint16);
int16 __ovld __cnfn convert_int16_sat_rtp(uint16);
int16 __ovld __cnfn convert_int16_rtn(uint16);
int16 __ovld __cnfn convert_int16_sat_rtn(uint16);
int16 __ovld __cnfn convert_int16(uint16);
int16 __ovld __cnfn convert_int16_sat(uint16);
int16 __ovld __cnfn convert_int16_rte(long16);
int16 __ovld __cnfn convert_int16_sat_rte(long16);
int16 __ovld __cnfn convert_int16_rtz(long16);
int16 __ovld __cnfn convert_int16_sat_rtz(long16);
int16 __ovld __cnfn convert_int16_rtp(long16);
int16 __ovld __cnfn convert_int16_sat_rtp(long16);
int16 __ovld __cnfn convert_int16_rtn(long16);
int16 __ovld __cnfn convert_int16_sat_rtn(long16);
int16 __ovld __cnfn convert_int16(long16);
int16 __ovld __cnfn convert_int16_sat(long16);
int16 __ovld __cnfn convert_int16_rte(ulong16);
int16 __ovld __cnfn convert_int16_sat_rte(ulong16);
int16 __ovld __cnfn convert_int16_rtz(ulong16);
int16 __ovld __cnfn convert_int16_sat_rtz(ulong16);
int16 __ovld __cnfn convert_int16_rtp(ulong16);
int16 __ovld __cnfn convert_int16_sat_rtp(ulong16);
int16 __ovld __cnfn convert_int16_rtn(ulong16);
int16 __ovld __cnfn convert_int16_sat_rtn(ulong16);
int16 __ovld __cnfn convert_int16(ulong16);
int16 __ovld __cnfn convert_int16_sat(ulong16);
int16 __ovld __cnfn convert_int16_rte(float16);
int16 __ovld __cnfn convert_int16_sat_rte(float16);
int16 __ovld __cnfn convert_int16_rtz(float16);
int16 __ovld __cnfn convert_int16_sat_rtz(float16);
int16 __ovld __cnfn convert_int16_rtp(float16);
int16 __ovld __cnfn convert_int16_sat_rtp(float16);
int16 __ovld __cnfn convert_int16_rtn(float16);
int16 __ovld __cnfn convert_int16_sat_rtn(float16);
int16 __ovld __cnfn convert_int16(float16);
int16 __ovld __cnfn convert_int16_sat(float16);
uint16 __ovld __cnfn convert_uint16_rte(char16);
uint16 __ovld __cnfn convert_uint16_sat_rte(char16);
uint16 __ovld __cnfn convert_uint16_rtz(char16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(char16);
uint16 __ovld __cnfn convert_uint16_rtp(char16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(char16);
uint16 __ovld __cnfn convert_uint16_rtn(char16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(char16);
uint16 __ovld __cnfn convert_uint16(char16);
uint16 __ovld __cnfn convert_uint16_sat(char16);
uint16 __ovld __cnfn convert_uint16_rte(uchar16);
uint16 __ovld __cnfn convert_uint16_sat_rte(uchar16);
uint16 __ovld __cnfn convert_uint16_rtz(uchar16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(uchar16);
uint16 __ovld __cnfn convert_uint16_rtp(uchar16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(uchar16);
uint16 __ovld __cnfn convert_uint16_rtn(uchar16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(uchar16);
uint16 __ovld __cnfn convert_uint16(uchar16);
uint16 __ovld __cnfn convert_uint16_sat(uchar16);
uint16 __ovld __cnfn convert_uint16_rte(short16);
uint16 __ovld __cnfn convert_uint16_sat_rte(short16);
uint16 __ovld __cnfn convert_uint16_rtz(short16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(short16);
uint16 __ovld __cnfn convert_uint16_rtp(short16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(short16);
uint16 __ovld __cnfn convert_uint16_rtn(short16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(short16);
uint16 __ovld __cnfn convert_uint16(short16);
uint16 __ovld __cnfn convert_uint16_sat(short16);
uint16 __ovld __cnfn convert_uint16_rte(ushort16);
uint16 __ovld __cnfn convert_uint16_sat_rte(ushort16);
uint16 __ovld __cnfn convert_uint16_rtz(ushort16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(ushort16);
uint16 __ovld __cnfn convert_uint16_rtp(ushort16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(ushort16);
uint16 __ovld __cnfn convert_uint16_rtn(ushort16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(ushort16);
uint16 __ovld __cnfn convert_uint16(ushort16);
uint16 __ovld __cnfn convert_uint16_sat(ushort16);
uint16 __ovld __cnfn convert_uint16_rte(int16);
uint16 __ovld __cnfn convert_uint16_sat_rte(int16);
uint16 __ovld __cnfn convert_uint16_rtz(int16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(int16);
uint16 __ovld __cnfn convert_uint16_rtp(int16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(int16);
uint16 __ovld __cnfn convert_uint16_rtn(int16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(int16);
uint16 __ovld __cnfn convert_uint16(int16);
uint16 __ovld __cnfn convert_uint16_sat(int16);
uint16 __ovld __cnfn convert_uint16_rte(uint16);
uint16 __ovld __cnfn convert_uint16_sat_rte(uint16);
uint16 __ovld __cnfn convert_uint16_rtz(uint16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(uint16);
uint16 __ovld __cnfn convert_uint16_rtp(uint16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(uint16);
uint16 __ovld __cnfn convert_uint16_rtn(uint16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(uint16);
uint16 __ovld __cnfn convert_uint16(uint16);
uint16 __ovld __cnfn convert_uint16_sat(uint16);
uint16 __ovld __cnfn convert_uint16_rte(long16);
uint16 __ovld __cnfn convert_uint16_sat_rte(long16);
uint16 __ovld __cnfn convert_uint16_rtz(long16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(long16);
uint16 __ovld __cnfn convert_uint16_rtp(long16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(long16);
uint16 __ovld __cnfn convert_uint16_rtn(long16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(long16);
uint16 __ovld __cnfn convert_uint16(long16);
uint16 __ovld __cnfn convert_uint16_sat(long16);
uint16 __ovld __cnfn convert_uint16_rte(ulong16);
uint16 __ovld __cnfn convert_uint16_sat_rte(ulong16);
uint16 __ovld __cnfn convert_uint16_rtz(ulong16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(ulong16);
uint16 __ovld __cnfn convert_uint16_rtp(ulong16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(ulong16);
uint16 __ovld __cnfn convert_uint16_rtn(ulong16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(ulong16);
uint16 __ovld __cnfn convert_uint16(ulong16);
uint16 __ovld __cnfn convert_uint16_sat(ulong16);
uint16 __ovld __cnfn convert_uint16_rte(float16);
uint16 __ovld __cnfn convert_uint16_sat_rte(float16);
uint16 __ovld __cnfn convert_uint16_rtz(float16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(float16);
uint16 __ovld __cnfn convert_uint16_rtp(float16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(float16);
uint16 __ovld __cnfn convert_uint16_rtn(float16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(float16);
uint16 __ovld __cnfn convert_uint16(float16);
uint16 __ovld __cnfn convert_uint16_sat(float16);
long16 __ovld __cnfn convert_long16_rte(char16);
long16 __ovld __cnfn convert_long16_sat_rte(char16);
long16 __ovld __cnfn convert_long16_rtz(char16);
long16 __ovld __cnfn convert_long16_sat_rtz(char16);
long16 __ovld __cnfn convert_long16_rtp(char16);
long16 __ovld __cnfn convert_long16_sat_rtp(char16);
long16 __ovld __cnfn convert_long16_rtn(char16);
long16 __ovld __cnfn convert_long16_sat_rtn(char16);
long16 __ovld __cnfn convert_long16(char16);
long16 __ovld __cnfn convert_long16_sat(char16);
long16 __ovld __cnfn convert_long16_rte(uchar16);
long16 __ovld __cnfn convert_long16_sat_rte(uchar16);
long16 __ovld __cnfn convert_long16_rtz(uchar16);
long16 __ovld __cnfn convert_long16_sat_rtz(uchar16);
long16 __ovld __cnfn convert_long16_rtp(uchar16);
long16 __ovld __cnfn convert_long16_sat_rtp(uchar16);
long16 __ovld __cnfn convert_long16_rtn(uchar16);
long16 __ovld __cnfn convert_long16_sat_rtn(uchar16);
long16 __ovld __cnfn convert_long16(uchar16);
long16 __ovld __cnfn convert_long16_sat(uchar16);
long16 __ovld __cnfn convert_long16_rte(short16);
long16 __ovld __cnfn convert_long16_sat_rte(short16);
long16 __ovld __cnfn convert_long16_rtz(short16);
long16 __ovld __cnfn convert_long16_sat_rtz(short16);
long16 __ovld __cnfn convert_long16_rtp(short16);
long16 __ovld __cnfn convert_long16_sat_rtp(short16);
long16 __ovld __cnfn convert_long16_rtn(short16);
long16 __ovld __cnfn convert_long16_sat_rtn(short16);
long16 __ovld __cnfn convert_long16(short16);
long16 __ovld __cnfn convert_long16_sat(short16);
long16 __ovld __cnfn convert_long16_rte(ushort16);
long16 __ovld __cnfn convert_long16_sat_rte(ushort16);
long16 __ovld __cnfn convert_long16_rtz(ushort16);
long16 __ovld __cnfn convert_long16_sat_rtz(ushort16);
long16 __ovld __cnfn convert_long16_rtp(ushort16);
long16 __ovld __cnfn convert_long16_sat_rtp(ushort16);
long16 __ovld __cnfn convert_long16_rtn(ushort16);
long16 __ovld __cnfn convert_long16_sat_rtn(ushort16);
long16 __ovld __cnfn convert_long16(ushort16);
long16 __ovld __cnfn convert_long16_sat(ushort16);
long16 __ovld __cnfn convert_long16_rte(int16);
long16 __ovld __cnfn convert_long16_sat_rte(int16);
long16 __ovld __cnfn convert_long16_rtz(int16);
long16 __ovld __cnfn convert_long16_sat_rtz(int16);
long16 __ovld __cnfn convert_long16_rtp(int16);
long16 __ovld __cnfn convert_long16_sat_rtp(int16);
long16 __ovld __cnfn convert_long16_rtn(int16);
long16 __ovld __cnfn convert_long16_sat_rtn(int16);
long16 __ovld __cnfn convert_long16(int16);
long16 __ovld __cnfn convert_long16_sat(int16);
long16 __ovld __cnfn convert_long16_rte(uint16);
long16 __ovld __cnfn convert_long16_sat_rte(uint16);
long16 __ovld __cnfn convert_long16_rtz(uint16);
long16 __ovld __cnfn convert_long16_sat_rtz(uint16);
long16 __ovld __cnfn convert_long16_rtp(uint16);
long16 __ovld __cnfn convert_long16_sat_rtp(uint16);
long16 __ovld __cnfn convert_long16_rtn(uint16);
long16 __ovld __cnfn convert_long16_sat_rtn(uint16);
long16 __ovld __cnfn convert_long16(uint16);
long16 __ovld __cnfn convert_long16_sat(uint16);
long16 __ovld __cnfn convert_long16_rte(long16);
long16 __ovld __cnfn convert_long16_sat_rte(long16);
long16 __ovld __cnfn convert_long16_rtz(long16);
long16 __ovld __cnfn convert_long16_sat_rtz(long16);
long16 __ovld __cnfn convert_long16_rtp(long16);
long16 __ovld __cnfn convert_long16_sat_rtp(long16);
long16 __ovld __cnfn convert_long16_rtn(long16);
long16 __ovld __cnfn convert_long16_sat_rtn(long16);
long16 __ovld __cnfn convert_long16(long16);
long16 __ovld __cnfn convert_long16_sat(long16);
long16 __ovld __cnfn convert_long16_rte(ulong16);
long16 __ovld __cnfn convert_long16_sat_rte(ulong16);
long16 __ovld __cnfn convert_long16_rtz(ulong16);
long16 __ovld __cnfn convert_long16_sat_rtz(ulong16);
long16 __ovld __cnfn convert_long16_rtp(ulong16);
long16 __ovld __cnfn convert_long16_sat_rtp(ulong16);
long16 __ovld __cnfn convert_long16_rtn(ulong16);
long16 __ovld __cnfn convert_long16_sat_rtn(ulong16);
long16 __ovld __cnfn convert_long16(ulong16);
long16 __ovld __cnfn convert_long16_sat(ulong16);
long16 __ovld __cnfn convert_long16_rte(float16);
long16 __ovld __cnfn convert_long16_sat_rte(float16);
long16 __ovld __cnfn convert_long16_rtz(float16);
long16 __ovld __cnfn convert_long16_sat_rtz(float16);
long16 __ovld __cnfn convert_long16_rtp(float16);
long16 __ovld __cnfn convert_long16_sat_rtp(float16);
long16 __ovld __cnfn convert_long16_rtn(float16);
long16 __ovld __cnfn convert_long16_sat_rtn(float16);
long16 __ovld __cnfn convert_long16(float16);
long16 __ovld __cnfn convert_long16_sat(float16);
ulong16 __ovld __cnfn convert_ulong16_rte(char16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(char16);
ulong16 __ovld __cnfn convert_ulong16_rtz(char16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(char16);
ulong16 __ovld __cnfn convert_ulong16_rtp(char16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(char16);
ulong16 __ovld __cnfn convert_ulong16_rtn(char16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(char16);
ulong16 __ovld __cnfn convert_ulong16(char16);
ulong16 __ovld __cnfn convert_ulong16_sat(char16);
ulong16 __ovld __cnfn convert_ulong16_rte(uchar16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(uchar16);
ulong16 __ovld __cnfn convert_ulong16_rtz(uchar16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(uchar16);
ulong16 __ovld __cnfn convert_ulong16_rtp(uchar16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(uchar16);
ulong16 __ovld __cnfn convert_ulong16_rtn(uchar16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(uchar16);
ulong16 __ovld __cnfn convert_ulong16(uchar16);
ulong16 __ovld __cnfn convert_ulong16_sat(uchar16);
ulong16 __ovld __cnfn convert_ulong16_rte(short16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(short16);
ulong16 __ovld __cnfn convert_ulong16_rtz(short16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(short16);
ulong16 __ovld __cnfn convert_ulong16_rtp(short16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(short16);
ulong16 __ovld __cnfn convert_ulong16_rtn(short16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(short16);
ulong16 __ovld __cnfn convert_ulong16(short16);
ulong16 __ovld __cnfn convert_ulong16_sat(short16);
ulong16 __ovld __cnfn convert_ulong16_rte(ushort16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(ushort16);
ulong16 __ovld __cnfn convert_ulong16_rtz(ushort16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(ushort16);
ulong16 __ovld __cnfn convert_ulong16_rtp(ushort16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(ushort16);
ulong16 __ovld __cnfn convert_ulong16_rtn(ushort16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(ushort16);
ulong16 __ovld __cnfn convert_ulong16(ushort16);
ulong16 __ovld __cnfn convert_ulong16_sat(ushort16);
ulong16 __ovld __cnfn convert_ulong16_rte(int16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(int16);
ulong16 __ovld __cnfn convert_ulong16_rtz(int16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(int16);
ulong16 __ovld __cnfn convert_ulong16_rtp(int16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(int16);
ulong16 __ovld __cnfn convert_ulong16_rtn(int16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(int16);
ulong16 __ovld __cnfn convert_ulong16(int16);
ulong16 __ovld __cnfn convert_ulong16_sat(int16);
ulong16 __ovld __cnfn convert_ulong16_rte(uint16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(uint16);
ulong16 __ovld __cnfn convert_ulong16_rtz(uint16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(uint16);
ulong16 __ovld __cnfn convert_ulong16_rtp(uint16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(uint16);
ulong16 __ovld __cnfn convert_ulong16_rtn(uint16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(uint16);
ulong16 __ovld __cnfn convert_ulong16(uint16);
ulong16 __ovld __cnfn convert_ulong16_sat(uint16);
ulong16 __ovld __cnfn convert_ulong16_rte(long16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(long16);
ulong16 __ovld __cnfn convert_ulong16_rtz(long16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(long16);
ulong16 __ovld __cnfn convert_ulong16_rtp(long16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(long16);
ulong16 __ovld __cnfn convert_ulong16_rtn(long16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(long16);
ulong16 __ovld __cnfn convert_ulong16(long16);
ulong16 __ovld __cnfn convert_ulong16_sat(long16);
ulong16 __ovld __cnfn convert_ulong16_rte(ulong16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(ulong16);
ulong16 __ovld __cnfn convert_ulong16_rtz(ulong16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(ulong16);
ulong16 __ovld __cnfn convert_ulong16_rtp(ulong16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(ulong16);
ulong16 __ovld __cnfn convert_ulong16_rtn(ulong16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(ulong16);
ulong16 __ovld __cnfn convert_ulong16(ulong16);
ulong16 __ovld __cnfn convert_ulong16_sat(ulong16);
ulong16 __ovld __cnfn convert_ulong16_rte(float16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(float16);
ulong16 __ovld __cnfn convert_ulong16_rtz(float16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(float16);
ulong16 __ovld __cnfn convert_ulong16_rtp(float16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(float16);
ulong16 __ovld __cnfn convert_ulong16_rtn(float16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(float16);
ulong16 __ovld __cnfn convert_ulong16(float16);
ulong16 __ovld __cnfn convert_ulong16_sat(float16);
float16 __ovld __cnfn convert_float16_rte(char16);
float16 __ovld __cnfn convert_float16_rtz(char16);
float16 __ovld __cnfn convert_float16_rtp(char16);
float16 __ovld __cnfn convert_float16_rtn(char16);
float16 __ovld __cnfn convert_float16(char16);
float16 __ovld __cnfn convert_float16_rte(uchar16);
float16 __ovld __cnfn convert_float16_rtz(uchar16);
float16 __ovld __cnfn convert_float16_rtp(uchar16);
float16 __ovld __cnfn convert_float16_rtn(uchar16);
float16 __ovld __cnfn convert_float16(uchar16);
float16 __ovld __cnfn convert_float16_rte(short16);
float16 __ovld __cnfn convert_float16_rtz(short16);
float16 __ovld __cnfn convert_float16_rtp(short16);
float16 __ovld __cnfn convert_float16_rtn(short16);
float16 __ovld __cnfn convert_float16(short16);
float16 __ovld __cnfn convert_float16_rte(ushort16);
float16 __ovld __cnfn convert_float16_rtz(ushort16);
float16 __ovld __cnfn convert_float16_rtp(ushort16);
float16 __ovld __cnfn convert_float16_rtn(ushort16);
float16 __ovld __cnfn convert_float16(ushort16);
float16 __ovld __cnfn convert_float16_rte(int16);
float16 __ovld __cnfn convert_float16_rtz(int16);
float16 __ovld __cnfn convert_float16_rtp(int16);
float16 __ovld __cnfn convert_float16_rtn(int16);
float16 __ovld __cnfn convert_float16(int16);
float16 __ovld __cnfn convert_float16_rte(uint16);
float16 __ovld __cnfn convert_float16_rtz(uint16);
float16 __ovld __cnfn convert_float16_rtp(uint16);
float16 __ovld __cnfn convert_float16_rtn(uint16);
float16 __ovld __cnfn convert_float16(uint16);
float16 __ovld __cnfn convert_float16_rte(long16);
float16 __ovld __cnfn convert_float16_rtz(long16);
float16 __ovld __cnfn convert_float16_rtp(long16);
float16 __ovld __cnfn convert_float16_rtn(long16);
float16 __ovld __cnfn convert_float16(long16);
float16 __ovld __cnfn convert_float16_rte(ulong16);
float16 __ovld __cnfn convert_float16_rtz(ulong16);
float16 __ovld __cnfn convert_float16_rtp(ulong16);
float16 __ovld __cnfn convert_float16_rtn(ulong16);
float16 __ovld __cnfn convert_float16(ulong16);
float16 __ovld __cnfn convert_float16_rte(float16);
float16 __ovld __cnfn convert_float16_rtz(float16);
float16 __ovld __cnfn convert_float16_rtp(float16);
float16 __ovld __cnfn convert_float16_rtn(float16);
float16 __ovld __cnfn convert_float16(float16);
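// The explicit conversion built-ins declared above follow the pattern
// convert_<destType>[_sat][_<roundingMode>](<srcType>): the optional _sat
// suffix clamps out-of-range results to the destination type's range, and
// the optional rounding suffixes _rte, _rtz, _rtp, and _rtn select
// round-to-nearest-even, toward zero, toward +infinity, and toward
// -infinity, respectively. As a minimal sketch, a hypothetical kernel could
// use a saturating conversion like this:
//
//   __kernel void scale_to_bytes(__global const float4 *in,
//                                __global uchar4 *out) {
//     size_t gid = get_global_id(0);
//     // Saturating round-to-nearest-even conversion: values outside
//     // [0, 255] clamp instead of wrapping.
//     out[gid] = convert_uchar4_sat_rte(in[gid] * 255.0f);
//   }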
// Conversions with double data type parameters or return values.
#ifdef cl_khr_fp64
char __ovld __cnfn convert_char(double);
char __ovld __cnfn convert_char_rte(double);
char __ovld __cnfn convert_char_rtn(double);
char __ovld __cnfn convert_char_rtp(double);
char __ovld __cnfn convert_char_rtz(double);
char __ovld __cnfn convert_char_sat(double);
char __ovld __cnfn convert_char_sat_rte(double);
char __ovld __cnfn convert_char_sat_rtn(double);
char __ovld __cnfn convert_char_sat_rtp(double);
char __ovld __cnfn convert_char_sat_rtz(double);
char2 __ovld __cnfn convert_char2(double2);
char2 __ovld __cnfn convert_char2_rte(double2);
char2 __ovld __cnfn convert_char2_rtn(double2);
char2 __ovld __cnfn convert_char2_rtp(double2);
char2 __ovld __cnfn convert_char2_rtz(double2);
char2 __ovld __cnfn convert_char2_sat(double2);
char2 __ovld __cnfn convert_char2_sat_rte(double2);
char2 __ovld __cnfn convert_char2_sat_rtn(double2);
char2 __ovld __cnfn convert_char2_sat_rtp(double2);
char2 __ovld __cnfn convert_char2_sat_rtz(double2);
char3 __ovld __cnfn convert_char3(double3);
char3 __ovld __cnfn convert_char3_rte(double3);
char3 __ovld __cnfn convert_char3_rtn(double3);
char3 __ovld __cnfn convert_char3_rtp(double3);
char3 __ovld __cnfn convert_char3_rtz(double3);
char3 __ovld __cnfn convert_char3_sat(double3);
char3 __ovld __cnfn convert_char3_sat_rte(double3);
char3 __ovld __cnfn convert_char3_sat_rtn(double3);
char3 __ovld __cnfn convert_char3_sat_rtp(double3);
char3 __ovld __cnfn convert_char3_sat_rtz(double3);
char4 __ovld __cnfn convert_char4(double4);
char4 __ovld __cnfn convert_char4_rte(double4);
char4 __ovld __cnfn convert_char4_rtn(double4);
char4 __ovld __cnfn convert_char4_rtp(double4);
char4 __ovld __cnfn convert_char4_rtz(double4);
char4 __ovld __cnfn convert_char4_sat(double4);
char4 __ovld __cnfn convert_char4_sat_rte(double4);
char4 __ovld __cnfn convert_char4_sat_rtn(double4);
char4 __ovld __cnfn convert_char4_sat_rtp(double4);
char4 __ovld __cnfn convert_char4_sat_rtz(double4);
char8 __ovld __cnfn convert_char8(double8);
char8 __ovld __cnfn convert_char8_rte(double8);
char8 __ovld __cnfn convert_char8_rtn(double8);
char8 __ovld __cnfn convert_char8_rtp(double8);
char8 __ovld __cnfn convert_char8_rtz(double8);
char8 __ovld __cnfn convert_char8_sat(double8);
char8 __ovld __cnfn convert_char8_sat_rte(double8);
char8 __ovld __cnfn convert_char8_sat_rtn(double8);
char8 __ovld __cnfn convert_char8_sat_rtp(double8);
char8 __ovld __cnfn convert_char8_sat_rtz(double8);
char16 __ovld __cnfn convert_char16(double16);
char16 __ovld __cnfn convert_char16_rte(double16);
char16 __ovld __cnfn convert_char16_rtn(double16);
char16 __ovld __cnfn convert_char16_rtp(double16);
char16 __ovld __cnfn convert_char16_rtz(double16);
char16 __ovld __cnfn convert_char16_sat(double16);
char16 __ovld __cnfn convert_char16_sat_rte(double16);
char16 __ovld __cnfn convert_char16_sat_rtn(double16);
char16 __ovld __cnfn convert_char16_sat_rtp(double16);
char16 __ovld __cnfn convert_char16_sat_rtz(double16);
uchar __ovld __cnfn convert_uchar(double);
uchar __ovld __cnfn convert_uchar_rte(double);
uchar __ovld __cnfn convert_uchar_rtn(double);
uchar __ovld __cnfn convert_uchar_rtp(double);
uchar __ovld __cnfn convert_uchar_rtz(double);
uchar __ovld __cnfn convert_uchar_sat(double);
uchar __ovld __cnfn convert_uchar_sat_rte(double);
uchar __ovld __cnfn convert_uchar_sat_rtn(double);
uchar __ovld __cnfn convert_uchar_sat_rtp(double);
uchar __ovld __cnfn convert_uchar_sat_rtz(double);
uchar2 __ovld __cnfn convert_uchar2(double2);
uchar2 __ovld __cnfn convert_uchar2_rte(double2);
uchar2 __ovld __cnfn convert_uchar2_rtn(double2);
uchar2 __ovld __cnfn convert_uchar2_rtp(double2);
uchar2 __ovld __cnfn convert_uchar2_rtz(double2);
uchar2 __ovld __cnfn convert_uchar2_sat(double2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(double2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(double2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(double2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(double2);
uchar3 __ovld __cnfn convert_uchar3(double3);
uchar3 __ovld __cnfn convert_uchar3_rte(double3);
uchar3 __ovld __cnfn convert_uchar3_rtn(double3);
uchar3 __ovld __cnfn convert_uchar3_rtp(double3);
uchar3 __ovld __cnfn convert_uchar3_rtz(double3);
uchar3 __ovld __cnfn convert_uchar3_sat(double3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(double3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(double3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(double3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(double3);
uchar4 __ovld __cnfn convert_uchar4(double4);
uchar4 __ovld __cnfn convert_uchar4_rte(double4);
uchar4 __ovld __cnfn convert_uchar4_rtn(double4);
uchar4 __ovld __cnfn convert_uchar4_rtp(double4);
uchar4 __ovld __cnfn convert_uchar4_rtz(double4);
uchar4 __ovld __cnfn convert_uchar4_sat(double4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(double4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(double4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(double4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(double4);
uchar8 __ovld __cnfn convert_uchar8(double8);
uchar8 __ovld __cnfn convert_uchar8_rte(double8);
uchar8 __ovld __cnfn convert_uchar8_rtn(double8);
uchar8 __ovld __cnfn convert_uchar8_rtp(double8);
uchar8 __ovld __cnfn convert_uchar8_rtz(double8);
uchar8 __ovld __cnfn convert_uchar8_sat(double8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(double8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(double8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(double8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(double8);
uchar16 __ovld __cnfn convert_uchar16(double16);
uchar16 __ovld __cnfn convert_uchar16_rte(double16);
uchar16 __ovld __cnfn convert_uchar16_rtn(double16);
uchar16 __ovld __cnfn convert_uchar16_rtp(double16);
uchar16 __ovld __cnfn convert_uchar16_rtz(double16);
uchar16 __ovld __cnfn convert_uchar16_sat(double16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(double16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(double16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(double16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(double16);
short __ovld __cnfn convert_short(double);
short __ovld __cnfn convert_short_rte(double);
short __ovld __cnfn convert_short_rtn(double);
short __ovld __cnfn convert_short_rtp(double);
short __ovld __cnfn convert_short_rtz(double);
short __ovld __cnfn convert_short_sat(double);
short __ovld __cnfn convert_short_sat_rte(double);
short __ovld __cnfn convert_short_sat_rtn(double);
short __ovld __cnfn convert_short_sat_rtp(double);
short __ovld __cnfn convert_short_sat_rtz(double);
short2 __ovld __cnfn convert_short2(double2);
short2 __ovld __cnfn convert_short2_rte(double2);
short2 __ovld __cnfn convert_short2_rtn(double2);
short2 __ovld __cnfn convert_short2_rtp(double2);
short2 __ovld __cnfn convert_short2_rtz(double2);
short2 __ovld __cnfn convert_short2_sat(double2);
short2 __ovld __cnfn convert_short2_sat_rte(double2);
short2 __ovld __cnfn convert_short2_sat_rtn(double2);
short2 __ovld __cnfn convert_short2_sat_rtp(double2);
short2 __ovld __cnfn convert_short2_sat_rtz(double2);
short3 __ovld __cnfn convert_short3(double3);
short3 __ovld __cnfn convert_short3_rte(double3);
short3 __ovld __cnfn convert_short3_rtn(double3);
short3 __ovld __cnfn convert_short3_rtp(double3);
short3 __ovld __cnfn convert_short3_rtz(double3);
short3 __ovld __cnfn convert_short3_sat(double3);
short3 __ovld __cnfn convert_short3_sat_rte(double3);
short3 __ovld __cnfn convert_short3_sat_rtn(double3);
short3 __ovld __cnfn convert_short3_sat_rtp(double3);
short3 __ovld __cnfn convert_short3_sat_rtz(double3);
short4 __ovld __cnfn convert_short4(double4);
short4 __ovld __cnfn convert_short4_rte(double4);
short4 __ovld __cnfn convert_short4_rtn(double4);
short4 __ovld __cnfn convert_short4_rtp(double4);
short4 __ovld __cnfn convert_short4_rtz(double4);
short4 __ovld __cnfn convert_short4_sat(double4);
short4 __ovld __cnfn convert_short4_sat_rte(double4);
short4 __ovld __cnfn convert_short4_sat_rtn(double4);
short4 __ovld __cnfn convert_short4_sat_rtp(double4);
short4 __ovld __cnfn convert_short4_sat_rtz(double4);
short8 __ovld __cnfn convert_short8(double8);
short8 __ovld __cnfn convert_short8_rte(double8);
short8 __ovld __cnfn convert_short8_rtn(double8);
short8 __ovld __cnfn convert_short8_rtp(double8);
short8 __ovld __cnfn convert_short8_rtz(double8);
short8 __ovld __cnfn convert_short8_sat(double8);
short8 __ovld __cnfn convert_short8_sat_rte(double8);
short8 __ovld __cnfn convert_short8_sat_rtn(double8);
short8 __ovld __cnfn convert_short8_sat_rtp(double8);
short8 __ovld __cnfn convert_short8_sat_rtz(double8);
short16 __ovld __cnfn convert_short16(double16);
short16 __ovld __cnfn convert_short16_rte(double16);
short16 __ovld __cnfn convert_short16_rtn(double16);
short16 __ovld __cnfn convert_short16_rtp(double16);
short16 __ovld __cnfn convert_short16_rtz(double16);
short16 __ovld __cnfn convert_short16_sat(double16);
short16 __ovld __cnfn convert_short16_sat_rte(double16);
short16 __ovld __cnfn convert_short16_sat_rtn(double16);
short16 __ovld __cnfn convert_short16_sat_rtp(double16);
short16 __ovld __cnfn convert_short16_sat_rtz(double16);
ushort __ovld __cnfn convert_ushort(double);
ushort __ovld __cnfn convert_ushort_rte(double);
ushort __ovld __cnfn convert_ushort_rtn(double);
ushort __ovld __cnfn convert_ushort_rtp(double);
ushort __ovld __cnfn convert_ushort_rtz(double);
ushort __ovld __cnfn convert_ushort_sat(double);
ushort __ovld __cnfn convert_ushort_sat_rte(double);
ushort __ovld __cnfn convert_ushort_sat_rtn(double);
ushort __ovld __cnfn convert_ushort_sat_rtp(double);
ushort __ovld __cnfn convert_ushort_sat_rtz(double);
ushort2 __ovld __cnfn convert_ushort2(double2);
ushort2 __ovld __cnfn convert_ushort2_rte(double2);
ushort2 __ovld __cnfn convert_ushort2_rtn(double2);
ushort2 __ovld __cnfn convert_ushort2_rtp(double2);
ushort2 __ovld __cnfn convert_ushort2_rtz(double2);
ushort2 __ovld __cnfn convert_ushort2_sat(double2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(double2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(double2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(double2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(double2);
ushort3 __ovld __cnfn convert_ushort3(double3);
ushort3 __ovld __cnfn convert_ushort3_rte(double3);
ushort3 __ovld __cnfn convert_ushort3_rtn(double3);
ushort3 __ovld __cnfn convert_ushort3_rtp(double3);
ushort3 __ovld __cnfn convert_ushort3_rtz(double3);
ushort3 __ovld __cnfn convert_ushort3_sat(double3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(double3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(double3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(double3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(double3);
ushort4 __ovld __cnfn convert_ushort4(double4);
ushort4 __ovld __cnfn convert_ushort4_rte(double4);
ushort4 __ovld __cnfn convert_ushort4_rtn(double4);
ushort4 __ovld __cnfn convert_ushort4_rtp(double4);
ushort4 __ovld __cnfn convert_ushort4_rtz(double4);
ushort4 __ovld __cnfn convert_ushort4_sat(double4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(double4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(double4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(double4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(double4);
ushort8 __ovld __cnfn convert_ushort8(double8);
ushort8 __ovld __cnfn convert_ushort8_rte(double8);
ushort8 __ovld __cnfn convert_ushort8_rtn(double8);
ushort8 __ovld __cnfn convert_ushort8_rtp(double8);
ushort8 __ovld __cnfn convert_ushort8_rtz(double8);
ushort8 __ovld __cnfn convert_ushort8_sat(double8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(double8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(double8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(double8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(double8);
ushort16 __ovld __cnfn convert_ushort16(double16);
ushort16 __ovld __cnfn convert_ushort16_rte(double16);
ushort16 __ovld __cnfn convert_ushort16_rtn(double16);
ushort16 __ovld __cnfn convert_ushort16_rtp(double16);
ushort16 __ovld __cnfn convert_ushort16_rtz(double16);
ushort16 __ovld __cnfn convert_ushort16_sat(double16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(double16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(double16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(double16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(double16);
int __ovld __cnfn convert_int(double);
int __ovld __cnfn convert_int_rte(double);
int __ovld __cnfn convert_int_rtn(double);
int __ovld __cnfn convert_int_rtp(double);
int __ovld __cnfn convert_int_rtz(double);
int __ovld __cnfn convert_int_sat(double);
int __ovld __cnfn convert_int_sat_rte(double);
int __ovld __cnfn convert_int_sat_rtn(double);
int __ovld __cnfn convert_int_sat_rtp(double);
int __ovld __cnfn convert_int_sat_rtz(double);
int2 __ovld __cnfn convert_int2(double2);
int2 __ovld __cnfn convert_int2_rte(double2);
int2 __ovld __cnfn convert_int2_rtn(double2);
int2 __ovld __cnfn convert_int2_rtp(double2);
int2 __ovld __cnfn convert_int2_rtz(double2);
int2 __ovld __cnfn convert_int2_sat(double2);
int2 __ovld __cnfn convert_int2_sat_rte(double2);
int2 __ovld __cnfn convert_int2_sat_rtn(double2);
int2 __ovld __cnfn convert_int2_sat_rtp(double2);
int2 __ovld __cnfn convert_int2_sat_rtz(double2);
int3 __ovld __cnfn convert_int3(double3);
int3 __ovld __cnfn convert_int3_rte(double3);
int3 __ovld __cnfn convert_int3_rtn(double3);
int3 __ovld __cnfn convert_int3_rtp(double3);
int3 __ovld __cnfn convert_int3_rtz(double3);
int3 __ovld __cnfn convert_int3_sat(double3);
int3 __ovld __cnfn convert_int3_sat_rte(double3);
int3 __ovld __cnfn convert_int3_sat_rtn(double3);
int3 __ovld __cnfn convert_int3_sat_rtp(double3);
int3 __ovld __cnfn convert_int3_sat_rtz(double3);
int4 __ovld __cnfn convert_int4(double4);
int4 __ovld __cnfn convert_int4_rte(double4);
int4 __ovld __cnfn convert_int4_rtn(double4);
int4 __ovld __cnfn convert_int4_rtp(double4);
int4 __ovld __cnfn convert_int4_rtz(double4);
int4 __ovld __cnfn convert_int4_sat(double4);
int4 __ovld __cnfn convert_int4_sat_rte(double4);
int4 __ovld __cnfn convert_int4_sat_rtn(double4);
int4 __ovld __cnfn convert_int4_sat_rtp(double4);
int4 __ovld __cnfn convert_int4_sat_rtz(double4);
int8 __ovld __cnfn convert_int8(double8);
int8 __ovld __cnfn convert_int8_rte(double8);
int8 __ovld __cnfn convert_int8_rtn(double8);
int8 __ovld __cnfn convert_int8_rtp(double8);
int8 __ovld __cnfn convert_int8_rtz(double8);
int8 __ovld __cnfn convert_int8_sat(double8);
int8 __ovld __cnfn convert_int8_sat_rte(double8);
int8 __ovld __cnfn convert_int8_sat_rtn(double8);
int8 __ovld __cnfn convert_int8_sat_rtp(double8);
int8 __ovld __cnfn convert_int8_sat_rtz(double8);
int16 __ovld __cnfn convert_int16(double16);
int16 __ovld __cnfn convert_int16_rte(double16);
int16 __ovld __cnfn convert_int16_rtn(double16);
int16 __ovld __cnfn convert_int16_rtp(double16);
int16 __ovld __cnfn convert_int16_rtz(double16);
int16 __ovld __cnfn convert_int16_sat(double16);
int16 __ovld __cnfn convert_int16_sat_rte(double16);
int16 __ovld __cnfn convert_int16_sat_rtn(double16);
int16 __ovld __cnfn convert_int16_sat_rtp(double16);
int16 __ovld __cnfn convert_int16_sat_rtz(double16);
uint __ovld __cnfn convert_uint(double);
uint __ovld __cnfn convert_uint_rte(double);
uint __ovld __cnfn convert_uint_rtn(double);
uint __ovld __cnfn convert_uint_rtp(double);
uint __ovld __cnfn convert_uint_rtz(double);
uint __ovld __cnfn convert_uint_sat(double);
uint __ovld __cnfn convert_uint_sat_rte(double);
uint __ovld __cnfn convert_uint_sat_rtn(double);
uint __ovld __cnfn convert_uint_sat_rtp(double);
uint __ovld __cnfn convert_uint_sat_rtz(double);
uint2 __ovld __cnfn convert_uint2(double2);
uint2 __ovld __cnfn convert_uint2_rte(double2);
uint2 __ovld __cnfn convert_uint2_rtn(double2);
uint2 __ovld __cnfn convert_uint2_rtp(double2);
uint2 __ovld __cnfn convert_uint2_rtz(double2);
uint2 __ovld __cnfn convert_uint2_sat(double2);
uint2 __ovld __cnfn convert_uint2_sat_rte(double2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(double2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(double2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(double2);
uint3 __ovld __cnfn convert_uint3(double3);
uint3 __ovld __cnfn convert_uint3_rte(double3);
uint3 __ovld __cnfn convert_uint3_rtn(double3);
uint3 __ovld __cnfn convert_uint3_rtp(double3);
uint3 __ovld __cnfn convert_uint3_rtz(double3);
uint3 __ovld __cnfn convert_uint3_sat(double3);
uint3 __ovld __cnfn convert_uint3_sat_rte(double3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(double3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(double3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(double3);
uint4 __ovld __cnfn convert_uint4(double4);
uint4 __ovld __cnfn convert_uint4_rte(double4);
uint4 __ovld __cnfn convert_uint4_rtn(double4);
uint4 __ovld __cnfn convert_uint4_rtp(double4);
uint4 __ovld __cnfn convert_uint4_rtz(double4);
uint4 __ovld __cnfn convert_uint4_sat(double4);
uint4 __ovld __cnfn convert_uint4_sat_rte(double4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(double4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(double4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(double4);
uint8 __ovld __cnfn convert_uint8(double8);
uint8 __ovld __cnfn convert_uint8_rte(double8);
uint8 __ovld __cnfn convert_uint8_rtn(double8);
uint8 __ovld __cnfn convert_uint8_rtp(double8);
uint8 __ovld __cnfn convert_uint8_rtz(double8);
uint8 __ovld __cnfn convert_uint8_sat(double8);
uint8 __ovld __cnfn convert_uint8_sat_rte(double8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(double8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(double8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(double8);
uint16 __ovld __cnfn convert_uint16(double16);
uint16 __ovld __cnfn convert_uint16_rte(double16);
uint16 __ovld __cnfn convert_uint16_rtn(double16);
uint16 __ovld __cnfn convert_uint16_rtp(double16);
uint16 __ovld __cnfn convert_uint16_rtz(double16);
uint16 __ovld __cnfn convert_uint16_sat(double16);
uint16 __ovld __cnfn convert_uint16_sat_rte(double16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(double16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(double16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(double16);
long __ovld __cnfn convert_long(double);
long __ovld __cnfn convert_long_rte(double);
long __ovld __cnfn convert_long_rtn(double);
long __ovld __cnfn convert_long_rtp(double);
long __ovld __cnfn convert_long_rtz(double);
long __ovld __cnfn convert_long_sat(double);
long __ovld __cnfn convert_long_sat_rte(double);
long __ovld __cnfn convert_long_sat_rtn(double);
long __ovld __cnfn convert_long_sat_rtp(double);
long __ovld __cnfn convert_long_sat_rtz(double);
long2 __ovld __cnfn convert_long2(double2);
long2 __ovld __cnfn convert_long2_rte(double2);
long2 __ovld __cnfn convert_long2_rtn(double2);
long2 __ovld __cnfn convert_long2_rtp(double2);
long2 __ovld __cnfn convert_long2_rtz(double2);
long2 __ovld __cnfn convert_long2_sat(double2);
long2 __ovld __cnfn convert_long2_sat_rte(double2);
long2 __ovld __cnfn convert_long2_sat_rtn(double2);
long2 __ovld __cnfn convert_long2_sat_rtp(double2);
long2 __ovld __cnfn convert_long2_sat_rtz(double2);
long3 __ovld __cnfn convert_long3(double3);
long3 __ovld __cnfn convert_long3_rte(double3);
long3 __ovld __cnfn convert_long3_rtn(double3);
long3 __ovld __cnfn convert_long3_rtp(double3);
long3 __ovld __cnfn convert_long3_rtz(double3);
long3 __ovld __cnfn convert_long3_sat(double3);
long3 __ovld __cnfn convert_long3_sat_rte(double3);
long3 __ovld __cnfn convert_long3_sat_rtn(double3);
long3 __ovld __cnfn convert_long3_sat_rtp(double3);
long3 __ovld __cnfn convert_long3_sat_rtz(double3);
long4 __ovld __cnfn convert_long4(double4);
long4 __ovld __cnfn convert_long4_rte(double4);
long4 __ovld __cnfn convert_long4_rtn(double4);
long4 __ovld __cnfn convert_long4_rtp(double4);
long4 __ovld __cnfn convert_long4_rtz(double4);
long4 __ovld __cnfn convert_long4_sat(double4);
long4 __ovld __cnfn convert_long4_sat_rte(double4);
long4 __ovld __cnfn convert_long4_sat_rtn(double4);
long4 __ovld __cnfn convert_long4_sat_rtp(double4);
long4 __ovld __cnfn convert_long4_sat_rtz(double4);
long8 __ovld __cnfn convert_long8(double8);
long8 __ovld __cnfn convert_long8_rte(double8);
long8 __ovld __cnfn convert_long8_rtn(double8);
long8 __ovld __cnfn convert_long8_rtp(double8);
long8 __ovld __cnfn convert_long8_rtz(double8);
long8 __ovld __cnfn convert_long8_sat(double8);
long8 __ovld __cnfn convert_long8_sat_rte(double8);
long8 __ovld __cnfn convert_long8_sat_rtn(double8);
long8 __ovld __cnfn convert_long8_sat_rtp(double8);
long8 __ovld __cnfn convert_long8_sat_rtz(double8);
long16 __ovld __cnfn convert_long16(double16);
long16 __ovld __cnfn convert_long16_rte(double16);
long16 __ovld __cnfn convert_long16_rtn(double16);
long16 __ovld __cnfn convert_long16_rtp(double16);
long16 __ovld __cnfn convert_long16_rtz(double16);
long16 __ovld __cnfn convert_long16_sat(double16);
long16 __ovld __cnfn convert_long16_sat_rte(double16);
long16 __ovld __cnfn convert_long16_sat_rtn(double16);
long16 __ovld __cnfn convert_long16_sat_rtp(double16);
long16 __ovld __cnfn convert_long16_sat_rtz(double16);
ulong __ovld __cnfn convert_ulong(double);
ulong __ovld __cnfn convert_ulong_rte(double);
ulong __ovld __cnfn convert_ulong_rtn(double);
ulong __ovld __cnfn convert_ulong_rtp(double);
ulong __ovld __cnfn convert_ulong_rtz(double);
ulong __ovld __cnfn convert_ulong_sat(double);
ulong __ovld __cnfn convert_ulong_sat_rte(double);
ulong __ovld __cnfn convert_ulong_sat_rtn(double);
ulong __ovld __cnfn convert_ulong_sat_rtp(double);
ulong __ovld __cnfn convert_ulong_sat_rtz(double);
ulong2 __ovld __cnfn convert_ulong2(double2);
ulong2 __ovld __cnfn convert_ulong2_rte(double2);
ulong2 __ovld __cnfn convert_ulong2_rtn(double2);
ulong2 __ovld __cnfn convert_ulong2_rtp(double2);
ulong2 __ovld __cnfn convert_ulong2_rtz(double2);
ulong2 __ovld __cnfn convert_ulong2_sat(double2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(double2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(double2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(double2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(double2);
ulong3 __ovld __cnfn convert_ulong3(double3);
ulong3 __ovld __cnfn convert_ulong3_rte(double3);
ulong3 __ovld __cnfn convert_ulong3_rtn(double3);
ulong3 __ovld __cnfn convert_ulong3_rtp(double3);
ulong3 __ovld __cnfn convert_ulong3_rtz(double3);
ulong3 __ovld __cnfn convert_ulong3_sat(double3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(double3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(double3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(double3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(double3);
ulong4 __ovld __cnfn convert_ulong4(double4);
ulong4 __ovld __cnfn convert_ulong4_rte(double4);
ulong4 __ovld __cnfn convert_ulong4_rtn(double4);
ulong4 __ovld __cnfn convert_ulong4_rtp(double4);
ulong4 __ovld __cnfn convert_ulong4_rtz(double4);
ulong4 __ovld __cnfn convert_ulong4_sat(double4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(double4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(double4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(double4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(double4);
ulong8 __ovld __cnfn convert_ulong8(double8);
ulong8 __ovld __cnfn convert_ulong8_rte(double8);
ulong8 __ovld __cnfn convert_ulong8_rtn(double8);
ulong8 __ovld __cnfn convert_ulong8_rtp(double8);
ulong8 __ovld __cnfn convert_ulong8_rtz(double8);
ulong8 __ovld __cnfn convert_ulong8_sat(double8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(double8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(double8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(double8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(double8);
ulong16 __ovld __cnfn convert_ulong16(double16);
ulong16 __ovld __cnfn convert_ulong16_rte(double16);
ulong16 __ovld __cnfn convert_ulong16_rtn(double16);
ulong16 __ovld __cnfn convert_ulong16_rtp(double16);
ulong16 __ovld __cnfn convert_ulong16_rtz(double16);
ulong16 __ovld __cnfn convert_ulong16_sat(double16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(double16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(double16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(double16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(double16);
float __ovld __cnfn convert_float(double);
float __ovld __cnfn convert_float_rte(double);
float __ovld __cnfn convert_float_rtn(double);
float __ovld __cnfn convert_float_rtp(double);
float __ovld __cnfn convert_float_rtz(double);
float2 __ovld __cnfn convert_float2(double2);
float2 __ovld __cnfn convert_float2_rte(double2);
float2 __ovld __cnfn convert_float2_rtn(double2);
float2 __ovld __cnfn convert_float2_rtp(double2);
float2 __ovld __cnfn convert_float2_rtz(double2);
float3 __ovld __cnfn convert_float3(double3);
float3 __ovld __cnfn convert_float3_rte(double3);
float3 __ovld __cnfn convert_float3_rtn(double3);
float3 __ovld __cnfn convert_float3_rtp(double3);
float3 __ovld __cnfn convert_float3_rtz(double3);
float4 __ovld __cnfn convert_float4(double4);
float4 __ovld __cnfn convert_float4_rte(double4);
float4 __ovld __cnfn convert_float4_rtn(double4);
float4 __ovld __cnfn convert_float4_rtp(double4);
float4 __ovld __cnfn convert_float4_rtz(double4);
float8 __ovld __cnfn convert_float8(double8);
float8 __ovld __cnfn convert_float8_rte(double8);
float8 __ovld __cnfn convert_float8_rtn(double8);
float8 __ovld __cnfn convert_float8_rtp(double8);
float8 __ovld __cnfn convert_float8_rtz(double8);
float16 __ovld __cnfn convert_float16(double16);
float16 __ovld __cnfn convert_float16_rte(double16);
float16 __ovld __cnfn convert_float16_rtn(double16);
float16 __ovld __cnfn convert_float16_rtp(double16);
float16 __ovld __cnfn convert_float16_rtz(double16);
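// Note: conversions whose destination is a floating-point type (float here,
// double and half elsewhere in this file) have no _sat variants, because
// saturation is defined only for conversions to integer destination types;
// only the explicit rounding-mode suffixes (_rte/_rtz/_rtp/_rtn) apply.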
double __ovld __cnfn convert_double(char);
double __ovld __cnfn convert_double(double);
double __ovld __cnfn convert_double(float);
double __ovld __cnfn convert_double(int);
double __ovld __cnfn convert_double(long);
double __ovld __cnfn convert_double(short);
double __ovld __cnfn convert_double(uchar);
double __ovld __cnfn convert_double(uint);
double __ovld __cnfn convert_double(ulong);
double __ovld __cnfn convert_double(ushort);
double __ovld __cnfn convert_double_rte(char);
double __ovld __cnfn convert_double_rte(double);
double __ovld __cnfn convert_double_rte(float);
double __ovld __cnfn convert_double_rte(int);
double __ovld __cnfn convert_double_rte(long);
double __ovld __cnfn convert_double_rte(short);
double __ovld __cnfn convert_double_rte(uchar);
double __ovld __cnfn convert_double_rte(uint);
double __ovld __cnfn convert_double_rte(ulong);
double __ovld __cnfn convert_double_rte(ushort);
double __ovld __cnfn convert_double_rtn(char);
double __ovld __cnfn convert_double_rtn(double);
double __ovld __cnfn convert_double_rtn(float);
double __ovld __cnfn convert_double_rtn(int);
double __ovld __cnfn convert_double_rtn(long);
double __ovld __cnfn convert_double_rtn(short);
double __ovld __cnfn convert_double_rtn(uchar);
double __ovld __cnfn convert_double_rtn(uint);
double __ovld __cnfn convert_double_rtn(ulong);
double __ovld __cnfn convert_double_rtn(ushort);
double __ovld __cnfn convert_double_rtp(char);
double __ovld __cnfn convert_double_rtp(double);
double __ovld __cnfn convert_double_rtp(float);
double __ovld __cnfn convert_double_rtp(int);
double __ovld __cnfn convert_double_rtp(long);
double __ovld __cnfn convert_double_rtp(short);
double __ovld __cnfn convert_double_rtp(uchar);
double __ovld __cnfn convert_double_rtp(uint);
double __ovld __cnfn convert_double_rtp(ulong);
double __ovld __cnfn convert_double_rtp(ushort);
double __ovld __cnfn convert_double_rtz(char);
double __ovld __cnfn convert_double_rtz(double);
double __ovld __cnfn convert_double_rtz(float);
double __ovld __cnfn convert_double_rtz(int);
double __ovld __cnfn convert_double_rtz(long);
double __ovld __cnfn convert_double_rtz(short);
double __ovld __cnfn convert_double_rtz(uchar);
double __ovld __cnfn convert_double_rtz(uint);
double __ovld __cnfn convert_double_rtz(ulong);
double __ovld __cnfn convert_double_rtz(ushort);
double2 __ovld __cnfn convert_double2(char2);
double2 __ovld __cnfn convert_double2(double2);
double2 __ovld __cnfn convert_double2(float2);
double2 __ovld __cnfn convert_double2(int2);
double2 __ovld __cnfn convert_double2(long2);
double2 __ovld __cnfn convert_double2(short2);
double2 __ovld __cnfn convert_double2(uchar2);
double2 __ovld __cnfn convert_double2(uint2);
double2 __ovld __cnfn convert_double2(ulong2);
double2 __ovld __cnfn convert_double2(ushort2);
double2 __ovld __cnfn convert_double2_rte(char2);
double2 __ovld __cnfn convert_double2_rte(double2);
double2 __ovld __cnfn convert_double2_rte(float2);
double2 __ovld __cnfn convert_double2_rte(int2);
double2 __ovld __cnfn convert_double2_rte(long2);
double2 __ovld __cnfn convert_double2_rte(short2);
double2 __ovld __cnfn convert_double2_rte(uchar2);
double2 __ovld __cnfn convert_double2_rte(uint2);
double2 __ovld __cnfn convert_double2_rte(ulong2);
double2 __ovld __cnfn convert_double2_rte(ushort2);
double2 __ovld __cnfn convert_double2_rtn(char2);
double2 __ovld __cnfn convert_double2_rtn(double2);
double2 __ovld __cnfn convert_double2_rtn(float2);
double2 __ovld __cnfn convert_double2_rtn(int2);
double2 __ovld __cnfn convert_double2_rtn(long2);
double2 __ovld __cnfn convert_double2_rtn(short2);
double2 __ovld __cnfn convert_double2_rtn(uchar2);
double2 __ovld __cnfn convert_double2_rtn(uint2);
double2 __ovld __cnfn convert_double2_rtn(ulong2);
double2 __ovld __cnfn convert_double2_rtn(ushort2);
double2 __ovld __cnfn convert_double2_rtp(char2);
double2 __ovld __cnfn convert_double2_rtp(double2);
double2 __ovld __cnfn convert_double2_rtp(float2);
double2 __ovld __cnfn convert_double2_rtp(int2);
double2 __ovld __cnfn convert_double2_rtp(long2);
double2 __ovld __cnfn convert_double2_rtp(short2);
double2 __ovld __cnfn convert_double2_rtp(uchar2);
double2 __ovld __cnfn convert_double2_rtp(uint2);
double2 __ovld __cnfn convert_double2_rtp(ulong2);
double2 __ovld __cnfn convert_double2_rtp(ushort2);
double2 __ovld __cnfn convert_double2_rtz(char2);
double2 __ovld __cnfn convert_double2_rtz(double2);
double2 __ovld __cnfn convert_double2_rtz(float2);
double2 __ovld __cnfn convert_double2_rtz(int2);
double2 __ovld __cnfn convert_double2_rtz(long2);
double2 __ovld __cnfn convert_double2_rtz(short2);
double2 __ovld __cnfn convert_double2_rtz(uchar2);
double2 __ovld __cnfn convert_double2_rtz(uint2);
double2 __ovld __cnfn convert_double2_rtz(ulong2);
double2 __ovld __cnfn convert_double2_rtz(ushort2);
double3 __ovld __cnfn convert_double3(char3);
double3 __ovld __cnfn convert_double3(double3);
double3 __ovld __cnfn convert_double3(float3);
double3 __ovld __cnfn convert_double3(int3);
double3 __ovld __cnfn convert_double3(long3);
double3 __ovld __cnfn convert_double3(short3);
double3 __ovld __cnfn convert_double3(uchar3);
double3 __ovld __cnfn convert_double3(uint3);
double3 __ovld __cnfn convert_double3(ulong3);
double3 __ovld __cnfn convert_double3(ushort3);
double3 __ovld __cnfn convert_double3_rte(char3);
double3 __ovld __cnfn convert_double3_rte(double3);
double3 __ovld __cnfn convert_double3_rte(float3);
double3 __ovld __cnfn convert_double3_rte(int3);
double3 __ovld __cnfn convert_double3_rte(long3);
double3 __ovld __cnfn convert_double3_rte(short3);
double3 __ovld __cnfn convert_double3_rte(uchar3);
double3 __ovld __cnfn convert_double3_rte(uint3);
double3 __ovld __cnfn convert_double3_rte(ulong3);
double3 __ovld __cnfn convert_double3_rte(ushort3);
double3 __ovld __cnfn convert_double3_rtn(char3);
double3 __ovld __cnfn convert_double3_rtn(double3);
double3 __ovld __cnfn convert_double3_rtn(float3);
double3 __ovld __cnfn convert_double3_rtn(int3);
double3 __ovld __cnfn convert_double3_rtn(long3);
double3 __ovld __cnfn convert_double3_rtn(short3);
double3 __ovld __cnfn convert_double3_rtn(uchar3);
double3 __ovld __cnfn convert_double3_rtn(uint3);
double3 __ovld __cnfn convert_double3_rtn(ulong3);
double3 __ovld __cnfn convert_double3_rtn(ushort3);
double3 __ovld __cnfn convert_double3_rtp(char3);
double3 __ovld __cnfn convert_double3_rtp(double3);
double3 __ovld __cnfn convert_double3_rtp(float3);
double3 __ovld __cnfn convert_double3_rtp(int3);
double3 __ovld __cnfn convert_double3_rtp(long3);
double3 __ovld __cnfn convert_double3_rtp(short3);
double3 __ovld __cnfn convert_double3_rtp(uchar3);
double3 __ovld __cnfn convert_double3_rtp(uint3);
double3 __ovld __cnfn convert_double3_rtp(ulong3);
double3 __ovld __cnfn convert_double3_rtp(ushort3);
double3 __ovld __cnfn convert_double3_rtz(char3);
double3 __ovld __cnfn convert_double3_rtz(double3);
double3 __ovld __cnfn convert_double3_rtz(float3);
double3 __ovld __cnfn convert_double3_rtz(int3);
double3 __ovld __cnfn convert_double3_rtz(long3);
double3 __ovld __cnfn convert_double3_rtz(short3);
double3 __ovld __cnfn convert_double3_rtz(uchar3);
double3 __ovld __cnfn convert_double3_rtz(uint3);
double3 __ovld __cnfn convert_double3_rtz(ulong3);
double3 __ovld __cnfn convert_double3_rtz(ushort3);
double4 __ovld __cnfn convert_double4(char4);
double4 __ovld __cnfn convert_double4(double4);
double4 __ovld __cnfn convert_double4(float4);
double4 __ovld __cnfn convert_double4(int4);
double4 __ovld __cnfn convert_double4(long4);
double4 __ovld __cnfn convert_double4(short4);
double4 __ovld __cnfn convert_double4(uchar4);
double4 __ovld __cnfn convert_double4(uint4);
double4 __ovld __cnfn convert_double4(ulong4);
double4 __ovld __cnfn convert_double4(ushort4);
double4 __ovld __cnfn convert_double4_rte(char4);
double4 __ovld __cnfn convert_double4_rte(double4);
double4 __ovld __cnfn convert_double4_rte(float4);
double4 __ovld __cnfn convert_double4_rte(int4);
double4 __ovld __cnfn convert_double4_rte(long4);
double4 __ovld __cnfn convert_double4_rte(short4);
double4 __ovld __cnfn convert_double4_rte(uchar4);
double4 __ovld __cnfn convert_double4_rte(uint4);
double4 __ovld __cnfn convert_double4_rte(ulong4);
double4 __ovld __cnfn convert_double4_rte(ushort4);
double4 __ovld __cnfn convert_double4_rtn(char4);
double4 __ovld __cnfn convert_double4_rtn(double4);
double4 __ovld __cnfn convert_double4_rtn(float4);
double4 __ovld __cnfn convert_double4_rtn(int4);
double4 __ovld __cnfn convert_double4_rtn(long4);
double4 __ovld __cnfn convert_double4_rtn(short4);
double4 __ovld __cnfn convert_double4_rtn(uchar4);
double4 __ovld __cnfn convert_double4_rtn(uint4);
double4 __ovld __cnfn convert_double4_rtn(ulong4);
double4 __ovld __cnfn convert_double4_rtn(ushort4);
double4 __ovld __cnfn convert_double4_rtp(char4);
double4 __ovld __cnfn convert_double4_rtp(double4);
double4 __ovld __cnfn convert_double4_rtp(float4);
double4 __ovld __cnfn convert_double4_rtp(int4);
double4 __ovld __cnfn convert_double4_rtp(long4);
double4 __ovld __cnfn convert_double4_rtp(short4);
double4 __ovld __cnfn convert_double4_rtp(uchar4);
double4 __ovld __cnfn convert_double4_rtp(uint4);
double4 __ovld __cnfn convert_double4_rtp(ulong4);
double4 __ovld __cnfn convert_double4_rtp(ushort4);
double4 __ovld __cnfn convert_double4_rtz(char4);
double4 __ovld __cnfn convert_double4_rtz(double4);
double4 __ovld __cnfn convert_double4_rtz(float4);
double4 __ovld __cnfn convert_double4_rtz(int4);
double4 __ovld __cnfn convert_double4_rtz(long4);
double4 __ovld __cnfn convert_double4_rtz(short4);
double4 __ovld __cnfn convert_double4_rtz(uchar4);
double4 __ovld __cnfn convert_double4_rtz(uint4);
double4 __ovld __cnfn convert_double4_rtz(ulong4);
double4 __ovld __cnfn convert_double4_rtz(ushort4);
double8 __ovld __cnfn convert_double8(char8);
double8 __ovld __cnfn convert_double8(double8);
double8 __ovld __cnfn convert_double8(float8);
double8 __ovld __cnfn convert_double8(int8);
double8 __ovld __cnfn convert_double8(long8);
double8 __ovld __cnfn convert_double8(short8);
double8 __ovld __cnfn convert_double8(uchar8);
double8 __ovld __cnfn convert_double8(uint8);
double8 __ovld __cnfn convert_double8(ulong8);
double8 __ovld __cnfn convert_double8(ushort8);
double8 __ovld __cnfn convert_double8_rte(char8);
double8 __ovld __cnfn convert_double8_rte(double8);
double8 __ovld __cnfn convert_double8_rte(float8);
double8 __ovld __cnfn convert_double8_rte(int8);
double8 __ovld __cnfn convert_double8_rte(long8);
double8 __ovld __cnfn convert_double8_rte(short8);
double8 __ovld __cnfn convert_double8_rte(uchar8);
double8 __ovld __cnfn convert_double8_rte(uint8);
double8 __ovld __cnfn convert_double8_rte(ulong8);
double8 __ovld __cnfn convert_double8_rte(ushort8);
double8 __ovld __cnfn convert_double8_rtn(char8);
double8 __ovld __cnfn convert_double8_rtn(double8);
double8 __ovld __cnfn convert_double8_rtn(float8);
double8 __ovld __cnfn convert_double8_rtn(int8);
double8 __ovld __cnfn convert_double8_rtn(long8);
double8 __ovld __cnfn convert_double8_rtn(short8);
double8 __ovld __cnfn convert_double8_rtn(uchar8);
double8 __ovld __cnfn convert_double8_rtn(uint8);
double8 __ovld __cnfn convert_double8_rtn(ulong8);
double8 __ovld __cnfn convert_double8_rtn(ushort8);
double8 __ovld __cnfn convert_double8_rtp(char8);
double8 __ovld __cnfn convert_double8_rtp(double8);
double8 __ovld __cnfn convert_double8_rtp(float8);
double8 __ovld __cnfn convert_double8_rtp(int8);
double8 __ovld __cnfn convert_double8_rtp(long8);
double8 __ovld __cnfn convert_double8_rtp(short8);
double8 __ovld __cnfn convert_double8_rtp(uchar8);
double8 __ovld __cnfn convert_double8_rtp(uint8);
double8 __ovld __cnfn convert_double8_rtp(ulong8);
double8 __ovld __cnfn convert_double8_rtp(ushort8);
double8 __ovld __cnfn convert_double8_rtz(char8);
double8 __ovld __cnfn convert_double8_rtz(double8);
double8 __ovld __cnfn convert_double8_rtz(float8);
double8 __ovld __cnfn convert_double8_rtz(int8);
double8 __ovld __cnfn convert_double8_rtz(long8);
double8 __ovld __cnfn convert_double8_rtz(short8);
double8 __ovld __cnfn convert_double8_rtz(uchar8);
double8 __ovld __cnfn convert_double8_rtz(uint8);
double8 __ovld __cnfn convert_double8_rtz(ulong8);
double8 __ovld __cnfn convert_double8_rtz(ushort8);
double16 __ovld __cnfn convert_double16(char16);
double16 __ovld __cnfn convert_double16(double16);
double16 __ovld __cnfn convert_double16(float16);
double16 __ovld __cnfn convert_double16(int16);
double16 __ovld __cnfn convert_double16(long16);
double16 __ovld __cnfn convert_double16(short16);
double16 __ovld __cnfn convert_double16(uchar16);
double16 __ovld __cnfn convert_double16(uint16);
double16 __ovld __cnfn convert_double16(ulong16);
double16 __ovld __cnfn convert_double16(ushort16);
double16 __ovld __cnfn convert_double16_rte(char16);
double16 __ovld __cnfn convert_double16_rte(double16);
double16 __ovld __cnfn convert_double16_rte(float16);
double16 __ovld __cnfn convert_double16_rte(int16);
double16 __ovld __cnfn convert_double16_rte(long16);
double16 __ovld __cnfn convert_double16_rte(short16);
double16 __ovld __cnfn convert_double16_rte(uchar16);
double16 __ovld __cnfn convert_double16_rte(uint16);
double16 __ovld __cnfn convert_double16_rte(ulong16);
double16 __ovld __cnfn convert_double16_rte(ushort16);
double16 __ovld __cnfn convert_double16_rtn(char16);
double16 __ovld __cnfn convert_double16_rtn(double16);
double16 __ovld __cnfn convert_double16_rtn(float16);
double16 __ovld __cnfn convert_double16_rtn(int16);
double16 __ovld __cnfn convert_double16_rtn(long16);
double16 __ovld __cnfn convert_double16_rtn(short16);
double16 __ovld __cnfn convert_double16_rtn(uchar16);
double16 __ovld __cnfn convert_double16_rtn(uint16);
double16 __ovld __cnfn convert_double16_rtn(ulong16);
double16 __ovld __cnfn convert_double16_rtn(ushort16);
double16 __ovld __cnfn convert_double16_rtp(char16);
double16 __ovld __cnfn convert_double16_rtp(double16);
double16 __ovld __cnfn convert_double16_rtp(float16);
double16 __ovld __cnfn convert_double16_rtp(int16);
double16 __ovld __cnfn convert_double16_rtp(long16);
double16 __ovld __cnfn convert_double16_rtp(short16);
double16 __ovld __cnfn convert_double16_rtp(uchar16);
double16 __ovld __cnfn convert_double16_rtp(uint16);
double16 __ovld __cnfn convert_double16_rtp(ulong16);
double16 __ovld __cnfn convert_double16_rtp(ushort16);
double16 __ovld __cnfn convert_double16_rtz(char16);
double16 __ovld __cnfn convert_double16_rtz(double16);
double16 __ovld __cnfn convert_double16_rtz(float16);
double16 __ovld __cnfn convert_double16_rtz(int16);
double16 __ovld __cnfn convert_double16_rtz(long16);
double16 __ovld __cnfn convert_double16_rtz(short16);
double16 __ovld __cnfn convert_double16_rtz(uchar16);
double16 __ovld __cnfn convert_double16_rtz(uint16);
double16 __ovld __cnfn convert_double16_rtz(ulong16);
double16 __ovld __cnfn convert_double16_rtz(ushort16);
#endif //cl_khr_fp64
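// Illustrative sketch (not part of the header): the declarations above follow
// the convert_<destType><n>[_sat][_<roundingMode>](<srcType><n>) naming scheme,
// where _sat clamps to the destination range and _rte/_rtz/_rtp/_rtn select
// round-to-nearest-even, toward zero, toward +infinity and toward -infinity.
// A device kernel with cl_khr_fp64 support might use them as below; the kernel
// name and buffer parameters are hypothetical.
//
//   #pragma OPENCL EXTENSION cl_khr_fp64 : enable
//   __kernel void quantize(__global const double4 *in, __global uchar4 *out) {
//     size_t i = get_global_id(0);
//     // Saturating, round-to-nearest-even conversion: values outside
//     // [0, 255] clamp to the bounds instead of wrapping.
//     out[i] = convert_uchar4_sat_rte(in[i]);
//   }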
#ifdef cl_khr_fp16
// Convert half types to non-double types.
uchar __ovld __cnfn convert_uchar(half);
uchar __ovld __cnfn convert_uchar_rte(half);
uchar __ovld __cnfn convert_uchar_rtp(half);
uchar __ovld __cnfn convert_uchar_rtn(half);
uchar __ovld __cnfn convert_uchar_rtz(half);
uchar __ovld __cnfn convert_uchar_sat(half);
uchar __ovld __cnfn convert_uchar_sat_rte(half);
uchar __ovld __cnfn convert_uchar_sat_rtp(half);
uchar __ovld __cnfn convert_uchar_sat_rtn(half);
uchar __ovld __cnfn convert_uchar_sat_rtz(half);
uchar2 __ovld __cnfn convert_uchar2(half2);
uchar2 __ovld __cnfn convert_uchar2_rte(half2);
uchar2 __ovld __cnfn convert_uchar2_rtp(half2);
uchar2 __ovld __cnfn convert_uchar2_rtn(half2);
uchar2 __ovld __cnfn convert_uchar2_rtz(half2);
uchar2 __ovld __cnfn convert_uchar2_sat(half2);
uchar2 __ovld __cnfn convert_uchar2_sat_rte(half2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtp(half2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtn(half2);
uchar2 __ovld __cnfn convert_uchar2_sat_rtz(half2);
uchar3 __ovld __cnfn convert_uchar3(half3);
uchar3 __ovld __cnfn convert_uchar3_rte(half3);
uchar3 __ovld __cnfn convert_uchar3_rtp(half3);
uchar3 __ovld __cnfn convert_uchar3_rtn(half3);
uchar3 __ovld __cnfn convert_uchar3_rtz(half3);
uchar3 __ovld __cnfn convert_uchar3_sat(half3);
uchar3 __ovld __cnfn convert_uchar3_sat_rte(half3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtp(half3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtn(half3);
uchar3 __ovld __cnfn convert_uchar3_sat_rtz(half3);
uchar4 __ovld __cnfn convert_uchar4(half4);
uchar4 __ovld __cnfn convert_uchar4_rte(half4);
uchar4 __ovld __cnfn convert_uchar4_rtp(half4);
uchar4 __ovld __cnfn convert_uchar4_rtn(half4);
uchar4 __ovld __cnfn convert_uchar4_rtz(half4);
uchar4 __ovld __cnfn convert_uchar4_sat(half4);
uchar4 __ovld __cnfn convert_uchar4_sat_rte(half4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtp(half4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtn(half4);
uchar4 __ovld __cnfn convert_uchar4_sat_rtz(half4);
uchar8 __ovld __cnfn convert_uchar8(half8);
uchar8 __ovld __cnfn convert_uchar8_rte(half8);
uchar8 __ovld __cnfn convert_uchar8_rtp(half8);
uchar8 __ovld __cnfn convert_uchar8_rtn(half8);
uchar8 __ovld __cnfn convert_uchar8_rtz(half8);
uchar8 __ovld __cnfn convert_uchar8_sat(half8);
uchar8 __ovld __cnfn convert_uchar8_sat_rte(half8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtp(half8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtn(half8);
uchar8 __ovld __cnfn convert_uchar8_sat_rtz(half8);
uchar16 __ovld __cnfn convert_uchar16(half16);
uchar16 __ovld __cnfn convert_uchar16_rte(half16);
uchar16 __ovld __cnfn convert_uchar16_rtp(half16);
uchar16 __ovld __cnfn convert_uchar16_rtn(half16);
uchar16 __ovld __cnfn convert_uchar16_rtz(half16);
uchar16 __ovld __cnfn convert_uchar16_sat(half16);
uchar16 __ovld __cnfn convert_uchar16_sat_rte(half16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtp(half16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtn(half16);
uchar16 __ovld __cnfn convert_uchar16_sat_rtz(half16);
ushort __ovld __cnfn convert_ushort(half);
ushort __ovld __cnfn convert_ushort_rte(half);
ushort __ovld __cnfn convert_ushort_rtp(half);
ushort __ovld __cnfn convert_ushort_rtn(half);
ushort __ovld __cnfn convert_ushort_rtz(half);
ushort __ovld __cnfn convert_ushort_sat(half);
ushort __ovld __cnfn convert_ushort_sat_rte(half);
ushort __ovld __cnfn convert_ushort_sat_rtp(half);
ushort __ovld __cnfn convert_ushort_sat_rtn(half);
ushort __ovld __cnfn convert_ushort_sat_rtz(half);
ushort2 __ovld __cnfn convert_ushort2(half2);
ushort2 __ovld __cnfn convert_ushort2_rte(half2);
ushort2 __ovld __cnfn convert_ushort2_rtp(half2);
ushort2 __ovld __cnfn convert_ushort2_rtn(half2);
ushort2 __ovld __cnfn convert_ushort2_rtz(half2);
ushort2 __ovld __cnfn convert_ushort2_sat(half2);
ushort2 __ovld __cnfn convert_ushort2_sat_rte(half2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtp(half2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtn(half2);
ushort2 __ovld __cnfn convert_ushort2_sat_rtz(half2);
ushort3 __ovld __cnfn convert_ushort3(half3);
ushort3 __ovld __cnfn convert_ushort3_rte(half3);
ushort3 __ovld __cnfn convert_ushort3_rtp(half3);
ushort3 __ovld __cnfn convert_ushort3_rtn(half3);
ushort3 __ovld __cnfn convert_ushort3_rtz(half3);
ushort3 __ovld __cnfn convert_ushort3_sat(half3);
ushort3 __ovld __cnfn convert_ushort3_sat_rte(half3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtp(half3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtn(half3);
ushort3 __ovld __cnfn convert_ushort3_sat_rtz(half3);
ushort4 __ovld __cnfn convert_ushort4(half4);
ushort4 __ovld __cnfn convert_ushort4_rte(half4);
ushort4 __ovld __cnfn convert_ushort4_rtp(half4);
ushort4 __ovld __cnfn convert_ushort4_rtn(half4);
ushort4 __ovld __cnfn convert_ushort4_rtz(half4);
ushort4 __ovld __cnfn convert_ushort4_sat(half4);
ushort4 __ovld __cnfn convert_ushort4_sat_rte(half4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtp(half4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtn(half4);
ushort4 __ovld __cnfn convert_ushort4_sat_rtz(half4);
ushort8 __ovld __cnfn convert_ushort8(half8);
ushort8 __ovld __cnfn convert_ushort8_rte(half8);
ushort8 __ovld __cnfn convert_ushort8_rtp(half8);
ushort8 __ovld __cnfn convert_ushort8_rtn(half8);
ushort8 __ovld __cnfn convert_ushort8_rtz(half8);
ushort8 __ovld __cnfn convert_ushort8_sat(half8);
ushort8 __ovld __cnfn convert_ushort8_sat_rte(half8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtp(half8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtn(half8);
ushort8 __ovld __cnfn convert_ushort8_sat_rtz(half8);
ushort16 __ovld __cnfn convert_ushort16(half16);
ushort16 __ovld __cnfn convert_ushort16_rte(half16);
ushort16 __ovld __cnfn convert_ushort16_rtp(half16);
ushort16 __ovld __cnfn convert_ushort16_rtn(half16);
ushort16 __ovld __cnfn convert_ushort16_rtz(half16);
ushort16 __ovld __cnfn convert_ushort16_sat(half16);
ushort16 __ovld __cnfn convert_ushort16_sat_rte(half16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtp(half16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtn(half16);
ushort16 __ovld __cnfn convert_ushort16_sat_rtz(half16);
uint __ovld __cnfn convert_uint(half);
uint __ovld __cnfn convert_uint_rte(half);
uint __ovld __cnfn convert_uint_rtp(half);
uint __ovld __cnfn convert_uint_rtn(half);
uint __ovld __cnfn convert_uint_rtz(half);
uint __ovld __cnfn convert_uint_sat(half);
uint __ovld __cnfn convert_uint_sat_rte(half);
uint __ovld __cnfn convert_uint_sat_rtp(half);
uint __ovld __cnfn convert_uint_sat_rtn(half);
uint __ovld __cnfn convert_uint_sat_rtz(half);
uint2 __ovld __cnfn convert_uint2(half2);
uint2 __ovld __cnfn convert_uint2_rte(half2);
uint2 __ovld __cnfn convert_uint2_rtp(half2);
uint2 __ovld __cnfn convert_uint2_rtn(half2);
uint2 __ovld __cnfn convert_uint2_rtz(half2);
uint2 __ovld __cnfn convert_uint2_sat(half2);
uint2 __ovld __cnfn convert_uint2_sat_rte(half2);
uint2 __ovld __cnfn convert_uint2_sat_rtp(half2);
uint2 __ovld __cnfn convert_uint2_sat_rtn(half2);
uint2 __ovld __cnfn convert_uint2_sat_rtz(half2);
uint3 __ovld __cnfn convert_uint3(half3);
uint3 __ovld __cnfn convert_uint3_rte(half3);
uint3 __ovld __cnfn convert_uint3_rtp(half3);
uint3 __ovld __cnfn convert_uint3_rtn(half3);
uint3 __ovld __cnfn convert_uint3_rtz(half3);
uint3 __ovld __cnfn convert_uint3_sat(half3);
uint3 __ovld __cnfn convert_uint3_sat_rte(half3);
uint3 __ovld __cnfn convert_uint3_sat_rtp(half3);
uint3 __ovld __cnfn convert_uint3_sat_rtn(half3);
uint3 __ovld __cnfn convert_uint3_sat_rtz(half3);
uint4 __ovld __cnfn convert_uint4(half4);
uint4 __ovld __cnfn convert_uint4_rte(half4);
uint4 __ovld __cnfn convert_uint4_rtp(half4);
uint4 __ovld __cnfn convert_uint4_rtn(half4);
uint4 __ovld __cnfn convert_uint4_rtz(half4);
uint4 __ovld __cnfn convert_uint4_sat(half4);
uint4 __ovld __cnfn convert_uint4_sat_rte(half4);
uint4 __ovld __cnfn convert_uint4_sat_rtp(half4);
uint4 __ovld __cnfn convert_uint4_sat_rtn(half4);
uint4 __ovld __cnfn convert_uint4_sat_rtz(half4);
uint8 __ovld __cnfn convert_uint8(half8);
uint8 __ovld __cnfn convert_uint8_rte(half8);
uint8 __ovld __cnfn convert_uint8_rtp(half8);
uint8 __ovld __cnfn convert_uint8_rtn(half8);
uint8 __ovld __cnfn convert_uint8_rtz(half8);
uint8 __ovld __cnfn convert_uint8_sat(half8);
uint8 __ovld __cnfn convert_uint8_sat_rte(half8);
uint8 __ovld __cnfn convert_uint8_sat_rtp(half8);
uint8 __ovld __cnfn convert_uint8_sat_rtn(half8);
uint8 __ovld __cnfn convert_uint8_sat_rtz(half8);
uint16 __ovld __cnfn convert_uint16(half16);
uint16 __ovld __cnfn convert_uint16_rte(half16);
uint16 __ovld __cnfn convert_uint16_rtp(half16);
uint16 __ovld __cnfn convert_uint16_rtn(half16);
uint16 __ovld __cnfn convert_uint16_rtz(half16);
uint16 __ovld __cnfn convert_uint16_sat(half16);
uint16 __ovld __cnfn convert_uint16_sat_rte(half16);
uint16 __ovld __cnfn convert_uint16_sat_rtp(half16);
uint16 __ovld __cnfn convert_uint16_sat_rtn(half16);
uint16 __ovld __cnfn convert_uint16_sat_rtz(half16);
ulong __ovld __cnfn convert_ulong(half);
ulong __ovld __cnfn convert_ulong_rte(half);
ulong __ovld __cnfn convert_ulong_rtp(half);
ulong __ovld __cnfn convert_ulong_rtn(half);
ulong __ovld __cnfn convert_ulong_rtz(half);
ulong __ovld __cnfn convert_ulong_sat(half);
ulong __ovld __cnfn convert_ulong_sat_rte(half);
ulong __ovld __cnfn convert_ulong_sat_rtp(half);
ulong __ovld __cnfn convert_ulong_sat_rtn(half);
ulong __ovld __cnfn convert_ulong_sat_rtz(half);
ulong2 __ovld __cnfn convert_ulong2(half2);
ulong2 __ovld __cnfn convert_ulong2_rte(half2);
ulong2 __ovld __cnfn convert_ulong2_rtp(half2);
ulong2 __ovld __cnfn convert_ulong2_rtn(half2);
ulong2 __ovld __cnfn convert_ulong2_rtz(half2);
ulong2 __ovld __cnfn convert_ulong2_sat(half2);
ulong2 __ovld __cnfn convert_ulong2_sat_rte(half2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtp(half2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtn(half2);
ulong2 __ovld __cnfn convert_ulong2_sat_rtz(half2);
ulong3 __ovld __cnfn convert_ulong3(half3);
ulong3 __ovld __cnfn convert_ulong3_rte(half3);
ulong3 __ovld __cnfn convert_ulong3_rtp(half3);
ulong3 __ovld __cnfn convert_ulong3_rtn(half3);
ulong3 __ovld __cnfn convert_ulong3_rtz(half3);
ulong3 __ovld __cnfn convert_ulong3_sat(half3);
ulong3 __ovld __cnfn convert_ulong3_sat_rte(half3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtp(half3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtn(half3);
ulong3 __ovld __cnfn convert_ulong3_sat_rtz(half3);
ulong4 __ovld __cnfn convert_ulong4(half4);
ulong4 __ovld __cnfn convert_ulong4_rte(half4);
ulong4 __ovld __cnfn convert_ulong4_rtp(half4);
ulong4 __ovld __cnfn convert_ulong4_rtn(half4);
ulong4 __ovld __cnfn convert_ulong4_rtz(half4);
ulong4 __ovld __cnfn convert_ulong4_sat(half4);
ulong4 __ovld __cnfn convert_ulong4_sat_rte(half4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtp(half4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtn(half4);
ulong4 __ovld __cnfn convert_ulong4_sat_rtz(half4);
ulong8 __ovld __cnfn convert_ulong8(half8);
ulong8 __ovld __cnfn convert_ulong8_rte(half8);
ulong8 __ovld __cnfn convert_ulong8_rtp(half8);
ulong8 __ovld __cnfn convert_ulong8_rtn(half8);
ulong8 __ovld __cnfn convert_ulong8_rtz(half8);
ulong8 __ovld __cnfn convert_ulong8_sat(half8);
ulong8 __ovld __cnfn convert_ulong8_sat_rte(half8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtp(half8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtn(half8);
ulong8 __ovld __cnfn convert_ulong8_sat_rtz(half8);
ulong16 __ovld __cnfn convert_ulong16(half16);
ulong16 __ovld __cnfn convert_ulong16_rte(half16);
ulong16 __ovld __cnfn convert_ulong16_rtp(half16);
ulong16 __ovld __cnfn convert_ulong16_rtn(half16);
ulong16 __ovld __cnfn convert_ulong16_rtz(half16);
ulong16 __ovld __cnfn convert_ulong16_sat(half16);
ulong16 __ovld __cnfn convert_ulong16_sat_rte(half16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtp(half16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtn(half16);
ulong16 __ovld __cnfn convert_ulong16_sat_rtz(half16);
char __ovld __cnfn convert_char(half);
char __ovld __cnfn convert_char_rte(half);
char __ovld __cnfn convert_char_rtp(half);
char __ovld __cnfn convert_char_rtn(half);
char __ovld __cnfn convert_char_rtz(half);
char __ovld __cnfn convert_char_sat(half);
char __ovld __cnfn convert_char_sat_rte(half);
char __ovld __cnfn convert_char_sat_rtp(half);
char __ovld __cnfn convert_char_sat_rtn(half);
char __ovld __cnfn convert_char_sat_rtz(half);
char2 __ovld __cnfn convert_char2(half2);
char2 __ovld __cnfn convert_char2_rte(half2);
char2 __ovld __cnfn convert_char2_rtp(half2);
char2 __ovld __cnfn convert_char2_rtn(half2);
char2 __ovld __cnfn convert_char2_rtz(half2);
char2 __ovld __cnfn convert_char2_sat(half2);
char2 __ovld __cnfn convert_char2_sat_rte(half2);
char2 __ovld __cnfn convert_char2_sat_rtp(half2);
char2 __ovld __cnfn convert_char2_sat_rtn(half2);
char2 __ovld __cnfn convert_char2_sat_rtz(half2);
char3 __ovld __cnfn convert_char3(half3);
char3 __ovld __cnfn convert_char3_rte(half3);
char3 __ovld __cnfn convert_char3_rtp(half3);
char3 __ovld __cnfn convert_char3_rtn(half3);
char3 __ovld __cnfn convert_char3_rtz(half3);
char3 __ovld __cnfn convert_char3_sat(half3);
char3 __ovld __cnfn convert_char3_sat_rte(half3);
char3 __ovld __cnfn convert_char3_sat_rtp(half3);
char3 __ovld __cnfn convert_char3_sat_rtn(half3);
char3 __ovld __cnfn convert_char3_sat_rtz(half3);
char4 __ovld __cnfn convert_char4(half4);
char4 __ovld __cnfn convert_char4_rte(half4);
char4 __ovld __cnfn convert_char4_rtp(half4);
char4 __ovld __cnfn convert_char4_rtn(half4);
char4 __ovld __cnfn convert_char4_rtz(half4);
char4 __ovld __cnfn convert_char4_sat(half4);
char4 __ovld __cnfn convert_char4_sat_rte(half4);
char4 __ovld __cnfn convert_char4_sat_rtp(half4);
char4 __ovld __cnfn convert_char4_sat_rtn(half4);
char4 __ovld __cnfn convert_char4_sat_rtz(half4);
char8 __ovld __cnfn convert_char8(half8);
char8 __ovld __cnfn convert_char8_rte(half8);
char8 __ovld __cnfn convert_char8_rtp(half8);
char8 __ovld __cnfn convert_char8_rtn(half8);
char8 __ovld __cnfn convert_char8_rtz(half8);
char8 __ovld __cnfn convert_char8_sat(half8);
char8 __ovld __cnfn convert_char8_sat_rte(half8);
char8 __ovld __cnfn convert_char8_sat_rtp(half8);
char8 __ovld __cnfn convert_char8_sat_rtn(half8);
char8 __ovld __cnfn convert_char8_sat_rtz(half8);
char16 __ovld __cnfn convert_char16(half16);
char16 __ovld __cnfn convert_char16_rte(half16);
char16 __ovld __cnfn convert_char16_rtp(half16);
char16 __ovld __cnfn convert_char16_rtn(half16);
char16 __ovld __cnfn convert_char16_rtz(half16);
char16 __ovld __cnfn convert_char16_sat(half16);
char16 __ovld __cnfn convert_char16_sat_rte(half16);
char16 __ovld __cnfn convert_char16_sat_rtp(half16);
char16 __ovld __cnfn convert_char16_sat_rtn(half16);
char16 __ovld __cnfn convert_char16_sat_rtz(half16);
short __ovld __cnfn convert_short(half);
short __ovld __cnfn convert_short_rte(half);
short __ovld __cnfn convert_short_rtp(half);
short __ovld __cnfn convert_short_rtn(half);
short __ovld __cnfn convert_short_rtz(half);
short __ovld __cnfn convert_short_sat(half);
short __ovld __cnfn convert_short_sat_rte(half);
short __ovld __cnfn convert_short_sat_rtp(half);
short __ovld __cnfn convert_short_sat_rtn(half);
short __ovld __cnfn convert_short_sat_rtz(half);
short2 __ovld __cnfn convert_short2(half2);
short2 __ovld __cnfn convert_short2_rte(half2);
short2 __ovld __cnfn convert_short2_rtp(half2);
short2 __ovld __cnfn convert_short2_rtn(half2);
short2 __ovld __cnfn convert_short2_rtz(half2);
short2 __ovld __cnfn convert_short2_sat(half2);
short2 __ovld __cnfn convert_short2_sat_rte(half2);
short2 __ovld __cnfn convert_short2_sat_rtp(half2);
short2 __ovld __cnfn convert_short2_sat_rtn(half2);
short2 __ovld __cnfn convert_short2_sat_rtz(half2);
short3 __ovld __cnfn convert_short3(half3);
short3 __ovld __cnfn convert_short3_rte(half3);
short3 __ovld __cnfn convert_short3_rtp(half3);
short3 __ovld __cnfn convert_short3_rtn(half3);
short3 __ovld __cnfn convert_short3_rtz(half3);
short3 __ovld __cnfn convert_short3_sat(half3);
short3 __ovld __cnfn convert_short3_sat_rte(half3);
short3 __ovld __cnfn convert_short3_sat_rtp(half3);
short3 __ovld __cnfn convert_short3_sat_rtn(half3);
short3 __ovld __cnfn convert_short3_sat_rtz(half3);
short4 __ovld __cnfn convert_short4(half4);
short4 __ovld __cnfn convert_short4_rte(half4);
short4 __ovld __cnfn convert_short4_rtp(half4);
short4 __ovld __cnfn convert_short4_rtn(half4);
short4 __ovld __cnfn convert_short4_rtz(half4);
short4 __ovld __cnfn convert_short4_sat(half4);
short4 __ovld __cnfn convert_short4_sat_rte(half4);
short4 __ovld __cnfn convert_short4_sat_rtp(half4);
short4 __ovld __cnfn convert_short4_sat_rtn(half4);
short4 __ovld __cnfn convert_short4_sat_rtz(half4);
short8 __ovld __cnfn convert_short8(half8);
short8 __ovld __cnfn convert_short8_rte(half8);
short8 __ovld __cnfn convert_short8_rtp(half8);
short8 __ovld __cnfn convert_short8_rtn(half8);
short8 __ovld __cnfn convert_short8_rtz(half8);
short8 __ovld __cnfn convert_short8_sat(half8);
short8 __ovld __cnfn convert_short8_sat_rte(half8);
short8 __ovld __cnfn convert_short8_sat_rtp(half8);
short8 __ovld __cnfn convert_short8_sat_rtn(half8);
short8 __ovld __cnfn convert_short8_sat_rtz(half8);
short16 __ovld __cnfn convert_short16(half16);
short16 __ovld __cnfn convert_short16_rte(half16);
short16 __ovld __cnfn convert_short16_rtp(half16);
short16 __ovld __cnfn convert_short16_rtn(half16);
short16 __ovld __cnfn convert_short16_rtz(half16);
short16 __ovld __cnfn convert_short16_sat(half16);
short16 __ovld __cnfn convert_short16_sat_rte(half16);
short16 __ovld __cnfn convert_short16_sat_rtp(half16);
short16 __ovld __cnfn convert_short16_sat_rtn(half16);
short16 __ovld __cnfn convert_short16_sat_rtz(half16);
int __ovld __cnfn convert_int(half);
int __ovld __cnfn convert_int_rte(half);
int __ovld __cnfn convert_int_rtp(half);
int __ovld __cnfn convert_int_rtn(half);
int __ovld __cnfn convert_int_rtz(half);
int __ovld __cnfn convert_int_sat(half);
int __ovld __cnfn convert_int_sat_rte(half);
int __ovld __cnfn convert_int_sat_rtp(half);
int __ovld __cnfn convert_int_sat_rtn(half);
int __ovld __cnfn convert_int_sat_rtz(half);
int2 __ovld __cnfn convert_int2(half2);
int2 __ovld __cnfn convert_int2_rte(half2);
int2 __ovld __cnfn convert_int2_rtp(half2);
int2 __ovld __cnfn convert_int2_rtn(half2);
int2 __ovld __cnfn convert_int2_rtz(half2);
int2 __ovld __cnfn convert_int2_sat(half2);
int2 __ovld __cnfn convert_int2_sat_rte(half2);
int2 __ovld __cnfn convert_int2_sat_rtp(half2);
int2 __ovld __cnfn convert_int2_sat_rtn(half2);
int2 __ovld __cnfn convert_int2_sat_rtz(half2);
int3 __ovld __cnfn convert_int3(half3);
int3 __ovld __cnfn convert_int3_rte(half3);
int3 __ovld __cnfn convert_int3_rtp(half3);
int3 __ovld __cnfn convert_int3_rtn(half3);
int3 __ovld __cnfn convert_int3_rtz(half3);
int3 __ovld __cnfn convert_int3_sat(half3);
int3 __ovld __cnfn convert_int3_sat_rte(half3);
int3 __ovld __cnfn convert_int3_sat_rtp(half3);
int3 __ovld __cnfn convert_int3_sat_rtn(half3);
int3 __ovld __cnfn convert_int3_sat_rtz(half3);
int4 __ovld __cnfn convert_int4(half4);
int4 __ovld __cnfn convert_int4_rte(half4);
int4 __ovld __cnfn convert_int4_rtp(half4);
int4 __ovld __cnfn convert_int4_rtn(half4);
int4 __ovld __cnfn convert_int4_rtz(half4);
int4 __ovld __cnfn convert_int4_sat(half4);
int4 __ovld __cnfn convert_int4_sat_rte(half4);
int4 __ovld __cnfn convert_int4_sat_rtp(half4);
int4 __ovld __cnfn convert_int4_sat_rtn(half4);
int4 __ovld __cnfn convert_int4_sat_rtz(half4);
int8 __ovld __cnfn convert_int8(half8);
int8 __ovld __cnfn convert_int8_rte(half8);
int8 __ovld __cnfn convert_int8_rtp(half8);
int8 __ovld __cnfn convert_int8_rtn(half8);
int8 __ovld __cnfn convert_int8_rtz(half8);
int8 __ovld __cnfn convert_int8_sat(half8);
int8 __ovld __cnfn convert_int8_sat_rte(half8);
int8 __ovld __cnfn convert_int8_sat_rtp(half8);
int8 __ovld __cnfn convert_int8_sat_rtn(half8);
int8 __ovld __cnfn convert_int8_sat_rtz(half8);
int16 __ovld __cnfn convert_int16(half16);
int16 __ovld __cnfn convert_int16_rte(half16);
int16 __ovld __cnfn convert_int16_rtp(half16);
int16 __ovld __cnfn convert_int16_rtn(half16);
int16 __ovld __cnfn convert_int16_rtz(half16);
int16 __ovld __cnfn convert_int16_sat(half16);
int16 __ovld __cnfn convert_int16_sat_rte(half16);
int16 __ovld __cnfn convert_int16_sat_rtp(half16);
int16 __ovld __cnfn convert_int16_sat_rtn(half16);
int16 __ovld __cnfn convert_int16_sat_rtz(half16);
long __ovld __cnfn convert_long(half);
long __ovld __cnfn convert_long_rte(half);
long __ovld __cnfn convert_long_rtp(half);
long __ovld __cnfn convert_long_rtn(half);
long __ovld __cnfn convert_long_rtz(half);
long __ovld __cnfn convert_long_sat(half);
long __ovld __cnfn convert_long_sat_rte(half);
long __ovld __cnfn convert_long_sat_rtp(half);
long __ovld __cnfn convert_long_sat_rtn(half);
long __ovld __cnfn convert_long_sat_rtz(half);
long2 __ovld __cnfn convert_long2(half2);
long2 __ovld __cnfn convert_long2_rte(half2);
long2 __ovld __cnfn convert_long2_rtp(half2);
long2 __ovld __cnfn convert_long2_rtn(half2);
long2 __ovld __cnfn convert_long2_rtz(half2);
long2 __ovld __cnfn convert_long2_sat(half2);
long2 __ovld __cnfn convert_long2_sat_rte(half2);
long2 __ovld __cnfn convert_long2_sat_rtp(half2);
long2 __ovld __cnfn convert_long2_sat_rtn(half2);
long2 __ovld __cnfn convert_long2_sat_rtz(half2);
long3 __ovld __cnfn convert_long3(half3);
long3 __ovld __cnfn convert_long3_rte(half3);
long3 __ovld __cnfn convert_long3_rtp(half3);
long3 __ovld __cnfn convert_long3_rtn(half3);
long3 __ovld __cnfn convert_long3_rtz(half3);
long3 __ovld __cnfn convert_long3_sat(half3);
long3 __ovld __cnfn convert_long3_sat_rte(half3);
long3 __ovld __cnfn convert_long3_sat_rtp(half3);
long3 __ovld __cnfn convert_long3_sat_rtn(half3);
long3 __ovld __cnfn convert_long3_sat_rtz(half3);
long4 __ovld __cnfn convert_long4(half4);
long4 __ovld __cnfn convert_long4_rte(half4);
long4 __ovld __cnfn convert_long4_rtp(half4);
long4 __ovld __cnfn convert_long4_rtn(half4);
long4 __ovld __cnfn convert_long4_rtz(half4);
long4 __ovld __cnfn convert_long4_sat(half4);
long4 __ovld __cnfn convert_long4_sat_rte(half4);
long4 __ovld __cnfn convert_long4_sat_rtp(half4);
long4 __ovld __cnfn convert_long4_sat_rtn(half4);
long4 __ovld __cnfn convert_long4_sat_rtz(half4);
long8 __ovld __cnfn convert_long8(half8);
long8 __ovld __cnfn convert_long8_rte(half8);
long8 __ovld __cnfn convert_long8_rtp(half8);
long8 __ovld __cnfn convert_long8_rtn(half8);
long8 __ovld __cnfn convert_long8_rtz(half8);
long8 __ovld __cnfn convert_long8_sat(half8);
long8 __ovld __cnfn convert_long8_sat_rte(half8);
long8 __ovld __cnfn convert_long8_sat_rtp(half8);
long8 __ovld __cnfn convert_long8_sat_rtn(half8);
long8 __ovld __cnfn convert_long8_sat_rtz(half8);
long16 __ovld __cnfn convert_long16(half16);
long16 __ovld __cnfn convert_long16_rte(half16);
long16 __ovld __cnfn convert_long16_rtp(half16);
long16 __ovld __cnfn convert_long16_rtn(half16);
long16 __ovld __cnfn convert_long16_rtz(half16);
long16 __ovld __cnfn convert_long16_sat(half16);
long16 __ovld __cnfn convert_long16_sat_rte(half16);
long16 __ovld __cnfn convert_long16_sat_rtp(half16);
long16 __ovld __cnfn convert_long16_sat_rtn(half16);
long16 __ovld __cnfn convert_long16_sat_rtz(half16);
float __ovld __cnfn convert_float(half);
float __ovld __cnfn convert_float_rte(half);
float __ovld __cnfn convert_float_rtp(half);
float __ovld __cnfn convert_float_rtn(half);
float __ovld __cnfn convert_float_rtz(half);
float2 __ovld __cnfn convert_float2(half2);
float2 __ovld __cnfn convert_float2_rte(half2);
float2 __ovld __cnfn convert_float2_rtp(half2);
float2 __ovld __cnfn convert_float2_rtn(half2);
float2 __ovld __cnfn convert_float2_rtz(half2);
float3 __ovld __cnfn convert_float3(half3);
float3 __ovld __cnfn convert_float3_rte(half3);
float3 __ovld __cnfn convert_float3_rtp(half3);
float3 __ovld __cnfn convert_float3_rtn(half3);
float3 __ovld __cnfn convert_float3_rtz(half3);
float4 __ovld __cnfn convert_float4(half4);
float4 __ovld __cnfn convert_float4_rte(half4);
float4 __ovld __cnfn convert_float4_rtp(half4);
float4 __ovld __cnfn convert_float4_rtn(half4);
float4 __ovld __cnfn convert_float4_rtz(half4);
float8 __ovld __cnfn convert_float8(half8);
float8 __ovld __cnfn convert_float8_rte(half8);
float8 __ovld __cnfn convert_float8_rtp(half8);
float8 __ovld __cnfn convert_float8_rtn(half8);
float8 __ovld __cnfn convert_float8_rtz(half8);
float16 __ovld __cnfn convert_float16(half16);
float16 __ovld __cnfn convert_float16_rte(half16);
float16 __ovld __cnfn convert_float16_rtp(half16);
float16 __ovld __cnfn convert_float16_rtn(half16);
float16 __ovld __cnfn convert_float16_rtz(half16);
// Convert non-double types to half types.
half __ovld __cnfn convert_half(uchar);
half __ovld __cnfn convert_half(ushort);
half __ovld __cnfn convert_half(uint);
half __ovld __cnfn convert_half(ulong);
half __ovld __cnfn convert_half(char);
half __ovld __cnfn convert_half(short);
half __ovld __cnfn convert_half(int);
half __ovld __cnfn convert_half(long);
half __ovld __cnfn convert_half(float);
half __ovld __cnfn convert_half(half);
half __ovld __cnfn convert_half_rte(uchar);
half __ovld __cnfn convert_half_rte(ushort);
half __ovld __cnfn convert_half_rte(uint);
half __ovld __cnfn convert_half_rte(ulong);
half __ovld __cnfn convert_half_rte(char);
half __ovld __cnfn convert_half_rte(short);
half __ovld __cnfn convert_half_rte(int);
half __ovld __cnfn convert_half_rte(long);
half __ovld __cnfn convert_half_rte(float);
half __ovld __cnfn convert_half_rte(half);
half __ovld __cnfn convert_half_rtp(uchar);
half __ovld __cnfn convert_half_rtp(ushort);
half __ovld __cnfn convert_half_rtp(uint);
half __ovld __cnfn convert_half_rtp(ulong);
half __ovld __cnfn convert_half_rtp(char);
half __ovld __cnfn convert_half_rtp(short);
half __ovld __cnfn convert_half_rtp(int);
half __ovld __cnfn convert_half_rtp(long);
half __ovld __cnfn convert_half_rtp(float);
half __ovld __cnfn convert_half_rtp(half);
half __ovld __cnfn convert_half_rtn(uchar);
half __ovld __cnfn convert_half_rtn(ushort);
half __ovld __cnfn convert_half_rtn(uint);
half __ovld __cnfn convert_half_rtn(ulong);
half __ovld __cnfn convert_half_rtn(char);
half __ovld __cnfn convert_half_rtn(short);
half __ovld __cnfn convert_half_rtn(int);
half __ovld __cnfn convert_half_rtn(long);
half __ovld __cnfn convert_half_rtn(float);
half __ovld __cnfn convert_half_rtn(half);
half __ovld __cnfn convert_half_rtz(uchar);
half __ovld __cnfn convert_half_rtz(ushort);
half __ovld __cnfn convert_half_rtz(uint);
half __ovld __cnfn convert_half_rtz(ulong);
half __ovld __cnfn convert_half_rtz(char);
half __ovld __cnfn convert_half_rtz(short);
half __ovld __cnfn convert_half_rtz(int);
half __ovld __cnfn convert_half_rtz(long);
half __ovld __cnfn convert_half_rtz(float);
half __ovld __cnfn convert_half_rtz(half);
half2 __ovld __cnfn convert_half2(char2);
half2 __ovld __cnfn convert_half2(uchar2);
half2 __ovld __cnfn convert_half2(short2);
half2 __ovld __cnfn convert_half2(ushort2);
half2 __ovld __cnfn convert_half2(int2);
half2 __ovld __cnfn convert_half2(uint2);
half2 __ovld __cnfn convert_half2(long2);
half2 __ovld __cnfn convert_half2(ulong2);
half2 __ovld __cnfn convert_half2(float2);
half2 __ovld __cnfn convert_half2(half2);
half2 __ovld __cnfn convert_half2_rte(char2);
half2 __ovld __cnfn convert_half2_rte(uchar2);
half2 __ovld __cnfn convert_half2_rte(short2);
half2 __ovld __cnfn convert_half2_rte(ushort2);
half2 __ovld __cnfn convert_half2_rte(int2);
half2 __ovld __cnfn convert_half2_rte(uint2);
half2 __ovld __cnfn convert_half2_rte(long2);
half2 __ovld __cnfn convert_half2_rte(ulong2);
half2 __ovld __cnfn convert_half2_rte(float2);
half2 __ovld __cnfn convert_half2_rte(half2);
half2 __ovld __cnfn convert_half2_rtp(char2);
half2 __ovld __cnfn convert_half2_rtp(uchar2);
half2 __ovld __cnfn convert_half2_rtp(short2);
half2 __ovld __cnfn convert_half2_rtp(ushort2);
half2 __ovld __cnfn convert_half2_rtp(int2);
half2 __ovld __cnfn convert_half2_rtp(uint2);
half2 __ovld __cnfn convert_half2_rtp(long2);
half2 __ovld __cnfn convert_half2_rtp(ulong2);
half2 __ovld __cnfn convert_half2_rtp(float2);
half2 __ovld __cnfn convert_half2_rtp(half2);
half2 __ovld __cnfn convert_half2_rtn(char2);
half2 __ovld __cnfn convert_half2_rtn(uchar2);
half2 __ovld __cnfn convert_half2_rtn(short2);
half2 __ovld __cnfn convert_half2_rtn(ushort2);
half2 __ovld __cnfn convert_half2_rtn(int2);
half2 __ovld __cnfn convert_half2_rtn(uint2);
half2 __ovld __cnfn convert_half2_rtn(long2);
half2 __ovld __cnfn convert_half2_rtn(ulong2);
half2 __ovld __cnfn convert_half2_rtn(float2);
half2 __ovld __cnfn convert_half2_rtn(half2);
half2 __ovld __cnfn convert_half2_rtz(char2);
half2 __ovld __cnfn convert_half2_rtz(uchar2);
half2 __ovld __cnfn convert_half2_rtz(short2);
half2 __ovld __cnfn convert_half2_rtz(ushort2);
half2 __ovld __cnfn convert_half2_rtz(int2);
half2 __ovld __cnfn convert_half2_rtz(uint2);
half2 __ovld __cnfn convert_half2_rtz(long2);
half2 __ovld __cnfn convert_half2_rtz(ulong2);
half2 __ovld __cnfn convert_half2_rtz(float2);
half2 __ovld __cnfn convert_half2_rtz(half2);
half3 __ovld __cnfn convert_half3(char3);
half3 __ovld __cnfn convert_half3(uchar3);
half3 __ovld __cnfn convert_half3(short3);
half3 __ovld __cnfn convert_half3(ushort3);
half3 __ovld __cnfn convert_half3(int3);
half3 __ovld __cnfn convert_half3(uint3);
half3 __ovld __cnfn convert_half3(long3);
half3 __ovld __cnfn convert_half3(ulong3);
half3 __ovld __cnfn convert_half3(float3);
half3 __ovld __cnfn convert_half3(half3);
half3 __ovld __cnfn convert_half3_rte(char3);
half3 __ovld __cnfn convert_half3_rte(uchar3);
half3 __ovld __cnfn convert_half3_rte(short3);
half3 __ovld __cnfn convert_half3_rte(ushort3);
half3 __ovld __cnfn convert_half3_rte(int3);
half3 __ovld __cnfn convert_half3_rte(uint3);
half3 __ovld __cnfn convert_half3_rte(long3);
half3 __ovld __cnfn convert_half3_rte(ulong3);
half3 __ovld __cnfn convert_half3_rte(float3);
half3 __ovld __cnfn convert_half3_rte(half3);
half3 __ovld __cnfn convert_half3_rtp(char3);
half3 __ovld __cnfn convert_half3_rtp(uchar3);
half3 __ovld __cnfn convert_half3_rtp(short3);
half3 __ovld __cnfn convert_half3_rtp(ushort3);
half3 __ovld __cnfn convert_half3_rtp(int3);
half3 __ovld __cnfn convert_half3_rtp(uint3);
half3 __ovld __cnfn convert_half3_rtp(long3);
half3 __ovld __cnfn convert_half3_rtp(ulong3);
half3 __ovld __cnfn convert_half3_rtp(float3);
half3 __ovld __cnfn convert_half3_rtp(half3);
half3 __ovld __cnfn convert_half3_rtn(char3);
half3 __ovld __cnfn convert_half3_rtn(uchar3);
half3 __ovld __cnfn convert_half3_rtn(short3);
half3 __ovld __cnfn convert_half3_rtn(ushort3);
half3 __ovld __cnfn convert_half3_rtn(int3);
half3 __ovld __cnfn convert_half3_rtn(uint3);
half3 __ovld __cnfn convert_half3_rtn(long3);
half3 __ovld __cnfn convert_half3_rtn(ulong3);
half3 __ovld __cnfn convert_half3_rtn(float3);
half3 __ovld __cnfn convert_half3_rtn(half3);
half3 __ovld __cnfn convert_half3_rtz(char3);
half3 __ovld __cnfn convert_half3_rtz(uchar3);
half3 __ovld __cnfn convert_half3_rtz(short3);
half3 __ovld __cnfn convert_half3_rtz(ushort3);
half3 __ovld __cnfn convert_half3_rtz(int3);
half3 __ovld __cnfn convert_half3_rtz(uint3);
half3 __ovld __cnfn convert_half3_rtz(long3);
half3 __ovld __cnfn convert_half3_rtz(ulong3);
half3 __ovld __cnfn convert_half3_rtz(float3);
half3 __ovld __cnfn convert_half3_rtz(half3);
half4 __ovld __cnfn convert_half4(char4);
half4 __ovld __cnfn convert_half4(uchar4);
half4 __ovld __cnfn convert_half4(short4);
half4 __ovld __cnfn convert_half4(ushort4);
half4 __ovld __cnfn convert_half4(int4);
half4 __ovld __cnfn convert_half4(uint4);
half4 __ovld __cnfn convert_half4(long4);
half4 __ovld __cnfn convert_half4(ulong4);
half4 __ovld __cnfn convert_half4(float4);
half4 __ovld __cnfn convert_half4(half4);
half4 __ovld __cnfn convert_half4_rte(char4);
half4 __ovld __cnfn convert_half4_rte(uchar4);
half4 __ovld __cnfn convert_half4_rte(short4);
half4 __ovld __cnfn convert_half4_rte(ushort4);
half4 __ovld __cnfn convert_half4_rte(int4);
half4 __ovld __cnfn convert_half4_rte(uint4);
half4 __ovld __cnfn convert_half4_rte(long4);
half4 __ovld __cnfn convert_half4_rte(ulong4);
half4 __ovld __cnfn convert_half4_rte(float4);
half4 __ovld __cnfn convert_half4_rte(half4);
half4 __ovld __cnfn convert_half4_rtp(char4);
half4 __ovld __cnfn convert_half4_rtp(uchar4);
half4 __ovld __cnfn convert_half4_rtp(short4);
half4 __ovld __cnfn convert_half4_rtp(ushort4);
half4 __ovld __cnfn convert_half4_rtp(int4);
half4 __ovld __cnfn convert_half4_rtp(uint4);
half4 __ovld __cnfn convert_half4_rtp(long4);
half4 __ovld __cnfn convert_half4_rtp(ulong4);
half4 __ovld __cnfn convert_half4_rtp(float4);
half4 __ovld __cnfn convert_half4_rtp(half4);
half4 __ovld __cnfn convert_half4_rtn(char4);
half4 __ovld __cnfn convert_half4_rtn(uchar4);
half4 __ovld __cnfn convert_half4_rtn(short4);
half4 __ovld __cnfn convert_half4_rtn(ushort4);
half4 __ovld __cnfn convert_half4_rtn(int4);
half4 __ovld __cnfn convert_half4_rtn(uint4);
half4 __ovld __cnfn convert_half4_rtn(long4);
half4 __ovld __cnfn convert_half4_rtn(ulong4);
half4 __ovld __cnfn convert_half4_rtn(float4);
half4 __ovld __cnfn convert_half4_rtn(half4);
half4 __ovld __cnfn convert_half4_rtz(char4);
half4 __ovld __cnfn convert_half4_rtz(uchar4);
half4 __ovld __cnfn convert_half4_rtz(short4);
half4 __ovld __cnfn convert_half4_rtz(ushort4);
half4 __ovld __cnfn convert_half4_rtz(int4);
half4 __ovld __cnfn convert_half4_rtz(uint4);
half4 __ovld __cnfn convert_half4_rtz(long4);
half4 __ovld __cnfn convert_half4_rtz(ulong4);
half4 __ovld __cnfn convert_half4_rtz(float4);
half4 __ovld __cnfn convert_half4_rtz(half4);
half8 __ovld __cnfn convert_half8(char8);
half8 __ovld __cnfn convert_half8(uchar8);
half8 __ovld __cnfn convert_half8(short8);
half8 __ovld __cnfn convert_half8(ushort8);
half8 __ovld __cnfn convert_half8(int8);
half8 __ovld __cnfn convert_half8(uint8);
half8 __ovld __cnfn convert_half8(long8);
half8 __ovld __cnfn convert_half8(ulong8);
half8 __ovld __cnfn convert_half8(float8);
half8 __ovld __cnfn convert_half8(half8);
half8 __ovld __cnfn convert_half8_rte(char8);
half8 __ovld __cnfn convert_half8_rte(uchar8);
half8 __ovld __cnfn convert_half8_rte(short8);
half8 __ovld __cnfn convert_half8_rte(ushort8);
half8 __ovld __cnfn convert_half8_rte(int8);
half8 __ovld __cnfn convert_half8_rte(uint8);
half8 __ovld __cnfn convert_half8_rte(long8);
half8 __ovld __cnfn convert_half8_rte(ulong8);
half8 __ovld __cnfn convert_half8_rte(float8);
half8 __ovld __cnfn convert_half8_rte(half8);
half8 __ovld __cnfn convert_half8_rtp(char8);
half8 __ovld __cnfn convert_half8_rtp(uchar8);
half8 __ovld __cnfn convert_half8_rtp(short8);
half8 __ovld __cnfn convert_half8_rtp(ushort8);
half8 __ovld __cnfn convert_half8_rtp(int8);
half8 __ovld __cnfn convert_half8_rtp(uint8);
half8 __ovld __cnfn convert_half8_rtp(long8);
half8 __ovld __cnfn convert_half8_rtp(ulong8);
half8 __ovld __cnfn convert_half8_rtp(float8);
half8 __ovld __cnfn convert_half8_rtp(half8);
half8 __ovld __cnfn convert_half8_rtn(char8);
half8 __ovld __cnfn convert_half8_rtn(uchar8);
half8 __ovld __cnfn convert_half8_rtn(short8);
half8 __ovld __cnfn convert_half8_rtn(ushort8);
half8 __ovld __cnfn convert_half8_rtn(int8);
half8 __ovld __cnfn convert_half8_rtn(uint8);
half8 __ovld __cnfn convert_half8_rtn(long8);
half8 __ovld __cnfn convert_half8_rtn(ulong8);
half8 __ovld __cnfn convert_half8_rtn(float8);
half8 __ovld __cnfn convert_half8_rtn(half8);
half8 __ovld __cnfn convert_half8_rtz(char8);
half8 __ovld __cnfn convert_half8_rtz(uchar8);
half8 __ovld __cnfn convert_half8_rtz(short8);
half8 __ovld __cnfn convert_half8_rtz(ushort8);
half8 __ovld __cnfn convert_half8_rtz(int8);
half8 __ovld __cnfn convert_half8_rtz(uint8);
half8 __ovld __cnfn convert_half8_rtz(long8);
half8 __ovld __cnfn convert_half8_rtz(ulong8);
half8 __ovld __cnfn convert_half8_rtz(float8);
half8 __ovld __cnfn convert_half8_rtz(half8);
half16 __ovld __cnfn convert_half16(char16);
half16 __ovld __cnfn convert_half16(uchar16);
half16 __ovld __cnfn convert_half16(short16);
half16 __ovld __cnfn convert_half16(ushort16);
half16 __ovld __cnfn convert_half16(int16);
half16 __ovld __cnfn convert_half16(uint16);
half16 __ovld __cnfn convert_half16(long16);
half16 __ovld __cnfn convert_half16(ulong16);
half16 __ovld __cnfn convert_half16(float16);
half16 __ovld __cnfn convert_half16(half16);
half16 __ovld __cnfn convert_half16_rte(char16);
half16 __ovld __cnfn convert_half16_rte(uchar16);
half16 __ovld __cnfn convert_half16_rte(short16);
half16 __ovld __cnfn convert_half16_rte(ushort16);
half16 __ovld __cnfn convert_half16_rte(int16);
half16 __ovld __cnfn convert_half16_rte(uint16);
half16 __ovld __cnfn convert_half16_rte(long16);
half16 __ovld __cnfn convert_half16_rte(ulong16);
half16 __ovld __cnfn convert_half16_rte(float16);
half16 __ovld __cnfn convert_half16_rte(half16);
half16 __ovld __cnfn convert_half16_rtp(char16);
half16 __ovld __cnfn convert_half16_rtp(uchar16);
half16 __ovld __cnfn convert_half16_rtp(short16);
half16 __ovld __cnfn convert_half16_rtp(ushort16);
half16 __ovld __cnfn convert_half16_rtp(int16);
half16 __ovld __cnfn convert_half16_rtp(uint16);
half16 __ovld __cnfn convert_half16_rtp(long16);
half16 __ovld __cnfn convert_half16_rtp(ulong16);
half16 __ovld __cnfn convert_half16_rtp(float16);
half16 __ovld __cnfn convert_half16_rtp(half16);
half16 __ovld __cnfn convert_half16_rtn(char16);
half16 __ovld __cnfn convert_half16_rtn(uchar16);
half16 __ovld __cnfn convert_half16_rtn(short16);
half16 __ovld __cnfn convert_half16_rtn(ushort16);
half16 __ovld __cnfn convert_half16_rtn(int16);
half16 __ovld __cnfn convert_half16_rtn(uint16);
half16 __ovld __cnfn convert_half16_rtn(long16);
half16 __ovld __cnfn convert_half16_rtn(ulong16);
half16 __ovld __cnfn convert_half16_rtn(float16);
half16 __ovld __cnfn convert_half16_rtn(half16);
half16 __ovld __cnfn convert_half16_rtz(char16);
half16 __ovld __cnfn convert_half16_rtz(uchar16);
half16 __ovld __cnfn convert_half16_rtz(short16);
half16 __ovld __cnfn convert_half16_rtz(ushort16);
half16 __ovld __cnfn convert_half16_rtz(int16);
half16 __ovld __cnfn convert_half16_rtz(uint16);
half16 __ovld __cnfn convert_half16_rtz(long16);
half16 __ovld __cnfn convert_half16_rtz(ulong16);
half16 __ovld __cnfn convert_half16_rtz(float16);
half16 __ovld __cnfn convert_half16_rtz(half16);
// Convert half types to double types.
#ifdef cl_khr_fp64
double __ovld __cnfn convert_double(half);
double __ovld __cnfn convert_double_rte(half);
double __ovld __cnfn convert_double_rtp(half);
double __ovld __cnfn convert_double_rtn(half);
double __ovld __cnfn convert_double_rtz(half);
double2 __ovld __cnfn convert_double2(half2);
double2 __ovld __cnfn convert_double2_rte(half2);
double2 __ovld __cnfn convert_double2_rtp(half2);
double2 __ovld __cnfn convert_double2_rtn(half2);
double2 __ovld __cnfn convert_double2_rtz(half2);
double3 __ovld __cnfn convert_double3(half3);
double3 __ovld __cnfn convert_double3_rte(half3);
double3 __ovld __cnfn convert_double3_rtp(half3);
double3 __ovld __cnfn convert_double3_rtn(half3);
double3 __ovld __cnfn convert_double3_rtz(half3);
double4 __ovld __cnfn convert_double4(half4);
double4 __ovld __cnfn convert_double4_rte(half4);
double4 __ovld __cnfn convert_double4_rtp(half4);
double4 __ovld __cnfn convert_double4_rtn(half4);
double4 __ovld __cnfn convert_double4_rtz(half4);
double8 __ovld __cnfn convert_double8(half8);
double8 __ovld __cnfn convert_double8_rte(half8);
double8 __ovld __cnfn convert_double8_rtp(half8);
double8 __ovld __cnfn convert_double8_rtn(half8);
double8 __ovld __cnfn convert_double8_rtz(half8);
double16 __ovld __cnfn convert_double16(half16);
double16 __ovld __cnfn convert_double16_rte(half16);
double16 __ovld __cnfn convert_double16_rtp(half16);
double16 __ovld __cnfn convert_double16_rtn(half16);
double16 __ovld __cnfn convert_double16_rtz(half16);
// Convert double types to half types.
half __ovld __cnfn convert_half(double);
half __ovld __cnfn convert_half_rte(double);
half __ovld __cnfn convert_half_rtp(double);
half __ovld __cnfn convert_half_rtn(double);
half __ovld __cnfn convert_half_rtz(double);
half2 __ovld __cnfn convert_half2(double2);
half2 __ovld __cnfn convert_half2_rte(double2);
half2 __ovld __cnfn convert_half2_rtp(double2);
half2 __ovld __cnfn convert_half2_rtn(double2);
half2 __ovld __cnfn convert_half2_rtz(double2);
half3 __ovld __cnfn convert_half3(double3);
half3 __ovld __cnfn convert_half3_rte(double3);
half3 __ovld __cnfn convert_half3_rtp(double3);
half3 __ovld __cnfn convert_half3_rtn(double3);
half3 __ovld __cnfn convert_half3_rtz(double3);
half4 __ovld __cnfn convert_half4(double4);
half4 __ovld __cnfn convert_half4_rte(double4);
half4 __ovld __cnfn convert_half4_rtp(double4);
half4 __ovld __cnfn convert_half4_rtn(double4);
half4 __ovld __cnfn convert_half4_rtz(double4);
half8 __ovld __cnfn convert_half8(double8);
half8 __ovld __cnfn convert_half8_rte(double8);
half8 __ovld __cnfn convert_half8_rtp(double8);
half8 __ovld __cnfn convert_half8_rtn(double8);
half8 __ovld __cnfn convert_half8_rtz(double8);
half16 __ovld __cnfn convert_half16(double16);
half16 __ovld __cnfn convert_half16_rte(double16);
half16 __ovld __cnfn convert_half16_rtp(double16);
half16 __ovld __cnfn convert_half16_rtn(double16);
half16 __ovld __cnfn convert_half16_rtz(double16);
#endif //cl_khr_fp64
#endif // cl_khr_fp16
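/*
 * Illustrative sketch (not part of the header): the explicit convert_
 * builtins above combine an optional _sat (saturation) suffix with an
 * optional rounding-mode suffix (_rte, _rtz, _rtp, _rtn). A hypothetical
 * kernel converting half data to saturated ints might look like:
 *
 *   #ifdef cl_khr_fp16
 *   __kernel void to_int_sat(__global const half4 *in, __global int4 *out) {
 *       size_t i = get_global_id(0);
 *       // _sat clamps out-of-range values; _rte rounds to nearest even.
 *       out[i] = convert_int4_sat_rte(in[i]);
 *   }
 *   #endif
 */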
/**
* OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
* Reinterprets a data type as another data type of the same size
*/
#define as_char(x) __builtin_astype((x), char)
#define as_char2(x) __builtin_astype((x), char2)
#define as_char3(x) __builtin_astype((x), char3)
#define as_char4(x) __builtin_astype((x), char4)
#define as_char8(x) __builtin_astype((x), char8)
#define as_char16(x) __builtin_astype((x), char16)
#define as_uchar(x) __builtin_astype((x), uchar)
#define as_uchar2(x) __builtin_astype((x), uchar2)
#define as_uchar3(x) __builtin_astype((x), uchar3)
#define as_uchar4(x) __builtin_astype((x), uchar4)
#define as_uchar8(x) __builtin_astype((x), uchar8)
#define as_uchar16(x) __builtin_astype((x), uchar16)
#define as_short(x) __builtin_astype((x), short)
#define as_short2(x) __builtin_astype((x), short2)
#define as_short3(x) __builtin_astype((x), short3)
#define as_short4(x) __builtin_astype((x), short4)
#define as_short8(x) __builtin_astype((x), short8)
#define as_short16(x) __builtin_astype((x), short16)
#define as_ushort(x) __builtin_astype((x), ushort)
#define as_ushort2(x) __builtin_astype((x), ushort2)
#define as_ushort3(x) __builtin_astype((x), ushort3)
#define as_ushort4(x) __builtin_astype((x), ushort4)
#define as_ushort8(x) __builtin_astype((x), ushort8)
#define as_ushort16(x) __builtin_astype((x), ushort16)
#define as_int(x) __builtin_astype((x), int)
#define as_int2(x) __builtin_astype((x), int2)
#define as_int3(x) __builtin_astype((x), int3)
#define as_int4(x) __builtin_astype((x), int4)
#define as_int8(x) __builtin_astype((x), int8)
#define as_int16(x) __builtin_astype((x), int16)
#define as_uint(x) __builtin_astype((x), uint)
#define as_uint2(x) __builtin_astype((x), uint2)
#define as_uint3(x) __builtin_astype((x), uint3)
#define as_uint4(x) __builtin_astype((x), uint4)
#define as_uint8(x) __builtin_astype((x), uint8)
#define as_uint16(x) __builtin_astype((x), uint16)
#define as_long(x) __builtin_astype((x), long)
#define as_long2(x) __builtin_astype((x), long2)
#define as_long3(x) __builtin_astype((x), long3)
#define as_long4(x) __builtin_astype((x), long4)
#define as_long8(x) __builtin_astype((x), long8)
#define as_long16(x) __builtin_astype((x), long16)
#define as_ulong(x) __builtin_astype((x), ulong)
#define as_ulong2(x) __builtin_astype((x), ulong2)
#define as_ulong3(x) __builtin_astype((x), ulong3)
#define as_ulong4(x) __builtin_astype((x), ulong4)
#define as_ulong8(x) __builtin_astype((x), ulong8)
#define as_ulong16(x) __builtin_astype((x), ulong16)
#define as_float(x) __builtin_astype((x), float)
#define as_float2(x) __builtin_astype((x), float2)
#define as_float3(x) __builtin_astype((x), float3)
#define as_float4(x) __builtin_astype((x), float4)
#define as_float8(x) __builtin_astype((x), float8)
#define as_float16(x) __builtin_astype((x), float16)
#ifdef cl_khr_fp64
#define as_double(x) __builtin_astype((x), double)
#define as_double2(x) __builtin_astype((x), double2)
#define as_double3(x) __builtin_astype((x), double3)
#define as_double4(x) __builtin_astype((x), double4)
#define as_double8(x) __builtin_astype((x), double8)
#define as_double16(x) __builtin_astype((x), double16)
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
#define as_half(x) __builtin_astype((x), half)
#define as_half2(x) __builtin_astype((x), half2)
#define as_half3(x) __builtin_astype((x), half3)
#define as_half4(x) __builtin_astype((x), half4)
#define as_half8(x) __builtin_astype((x), half8)
#define as_half16(x) __builtin_astype((x), half16)
#endif //cl_khr_fp16
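/*
 * Illustrative sketch (not part of the header): as_<type> reinterprets the
 * bit pattern of a value as another type of the same size, e.g. inspecting
 * the IEEE 754 encoding of a float without changing any bits:
 *
 *   uint bits  = as_uint(1.0f);      // 0x3F800000
 *   float back = as_float(bits);     // 1.0f again
 *   int4 lanes = as_int4((float4)(0.0f, -0.0f, 1.0f, -1.0f));
 */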
// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
#define __kernel_exec(X, typen) __kernel \
__attribute__((work_group_size_hint(X, 1, 1))) \
__attribute__((vec_type_hint(typen)))
#define kernel_exec(X, typen) __kernel \
__attribute__((work_group_size_hint(X, 1, 1))) \
__attribute__((vec_type_hint(typen)))
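/*
 * Illustrative sketch (not part of the header): __kernel_exec(X, typen)
 * expands to __kernel plus a work_group_size_hint of (X, 1, 1) and a
 * vec_type_hint. A hypothetical kernel tuned for 64 work-items operating
 * mostly on float4 data could be declared as:
 *
 *   __kernel_exec(64, float4) void scale(__global float4 *buf) {
 *       buf[get_global_id(0)] *= 2.0f;
 *   }
 */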
// OpenCL v1.1 s6.11.1, v1.2 s6.12.1, v2.0 s6.13.1 - Work-item Functions
/**
* Returns the number of dimensions in use. This is the
* value given to the work_dim argument specified in
* clEnqueueNDRangeKernel.
* For clEnqueueTask, this returns 1.
*/
uint __ovld __cnfn get_work_dim(void);
/**
* Returns the number of global work-items specified for
* dimension identified by dimindx. This value is given by
* the global_work_size argument to
* clEnqueueNDRangeKernel. Valid values of dimindx
* are 0 to get_work_dim() - 1. For other values of
* dimindx, get_global_size() returns 1.
* For clEnqueueTask, this always returns 1.
*/
size_t __ovld __cnfn get_global_size(uint dimindx);
/**
* Returns the unique global work-item ID value for
* dimension identified by dimindx. The global work-item
* ID specifies the work-item ID based on the number of
* global work-items specified to execute the kernel. Valid
* values of dimindx are 0 to get_work_dim() - 1. For
* other values of dimindx, get_global_id() returns 0.
* For clEnqueueTask, this returns 0.
*/
size_t __ovld __cnfn get_global_id(uint dimindx);
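/*
 * Illustrative sketch (not part of the header): the canonical use of
 * get_global_id() is one work-item per output element of a 1D range,
 * as in this hypothetical element-wise kernel:
 *
 *   __kernel void vadd(__global const float *a, __global const float *b,
 *                      __global float *c) {
 *       size_t gid = get_global_id(0);
 *       c[gid] = a[gid] + b[gid];
 *   }
 */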
/**
* Returns the number of local work-items specified in
* dimension identified by dimindx. This value is given by
* the local_work_size argument to
* clEnqueueNDRangeKernel if local_work_size is not
* NULL; otherwise the OpenCL implementation chooses
* an appropriate local_work_size value which is returned
* by this function. Valid values of dimindx are 0 to
* get_work_dim() - 1. For other values of dimindx,
* get_local_size() returns 1.
* For clEnqueueTask, this always returns 1.
*/
size_t __ovld __cnfn get_local_size(uint dimindx);
/**
* Returns the unique local work-item ID i.e. a work-item
* within a specific work-group for dimension identified by
* dimindx. Valid values of dimindx are 0 to
* get_work_dim() - 1. For other values of dimindx,
* get_local_id() returns 0.
* For clEnqueueTask, this returns 0.
*/
size_t __ovld __cnfn get_local_id(uint dimindx);
/**
* Returns the number of work-groups that will execute a
* kernel for dimension identified by dimindx.
* Valid values of dimindx are 0 to get_work_dim() - 1.
* For other values of dimindx, get_num_groups () returns
* 1.
* For clEnqueueTask, this always returns 1.
*/
size_t __ovld __cnfn get_num_groups(uint dimindx);
/**
* get_group_id returns the work-group ID which is a
* number from 0 .. get_num_groups(dimindx) - 1.
* Valid values of dimindx are 0 to get_work_dim() - 1.
* For other values, get_group_id() returns 0.
* For clEnqueueTask, this returns 0.
*/
size_t __ovld __cnfn get_group_id(uint dimindx);
/**
* get_global_offset returns the offset values specified in
* global_work_offset argument to
* clEnqueueNDRangeKernel.
* Valid values of dimindx are 0 to get_work_dim() - 1.
* For other values, get_global_offset() returns 0.
* For clEnqueueTask, this returns 0.
*/
size_t __ovld __cnfn get_global_offset(uint dimindx);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
size_t __ovld get_enqueued_local_size(uint dimindx);
size_t __ovld get_global_linear_id(void);
size_t __ovld get_local_linear_id(void);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
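/*
 * Illustrative note (not part of the header): for a valid dimension d and
 * uniform work-group sizes (OpenCL 1.x), the work-item functions above are
 * related by
 *
 *   get_global_id(d)   == get_group_id(d) * get_local_size(d)
 *                         + get_local_id(d) + get_global_offset(d)
 *   get_global_size(d) == get_num_groups(d) * get_local_size(d)
 *
 * With OpenCL 2.0 non-uniform work-groups, get_enqueued_local_size(d)
 * takes the place of get_local_size(d) in the first identity.
 */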
// OpenCL v1.1 s6.11.2, v1.2 s6.12.2, v2.0 s6.13.2 - Math functions
/**
* Arc cosine function.
*/
float __ovld __cnfn acos(float);
float2 __ovld __cnfn acos(float2);
float3 __ovld __cnfn acos(float3);
float4 __ovld __cnfn acos(float4);
float8 __ovld __cnfn acos(float8);
float16 __ovld __cnfn acos(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn acos(double);
double2 __ovld __cnfn acos(double2);
double3 __ovld __cnfn acos(double3);
double4 __ovld __cnfn acos(double4);
double8 __ovld __cnfn acos(double8);
double16 __ovld __cnfn acos(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn acos(half);
half2 __ovld __cnfn acos(half2);
half3 __ovld __cnfn acos(half3);
half4 __ovld __cnfn acos(half4);
half8 __ovld __cnfn acos(half8);
half16 __ovld __cnfn acos(half16);
#endif //cl_khr_fp16
/**
* Inverse hyperbolic cosine.
*/
float __ovld __cnfn acosh(float);
float2 __ovld __cnfn acosh(float2);
float3 __ovld __cnfn acosh(float3);
float4 __ovld __cnfn acosh(float4);
float8 __ovld __cnfn acosh(float8);
float16 __ovld __cnfn acosh(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn acosh(double);
double2 __ovld __cnfn acosh(double2);
double3 __ovld __cnfn acosh(double3);
double4 __ovld __cnfn acosh(double4);
double8 __ovld __cnfn acosh(double8);
double16 __ovld __cnfn acosh(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn acosh(half);
half2 __ovld __cnfn acosh(half2);
half3 __ovld __cnfn acosh(half3);
half4 __ovld __cnfn acosh(half4);
half8 __ovld __cnfn acosh(half8);
half16 __ovld __cnfn acosh(half16);
#endif //cl_khr_fp16
/**
* Compute acos (x) / PI.
*/
float __ovld __cnfn acospi(float x);
float2 __ovld __cnfn acospi(float2 x);
float3 __ovld __cnfn acospi(float3 x);
float4 __ovld __cnfn acospi(float4 x);
float8 __ovld __cnfn acospi(float8 x);
float16 __ovld __cnfn acospi(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn acospi(double x);
double2 __ovld __cnfn acospi(double2 x);
double3 __ovld __cnfn acospi(double3 x);
double4 __ovld __cnfn acospi(double4 x);
double8 __ovld __cnfn acospi(double8 x);
double16 __ovld __cnfn acospi(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn acospi(half x);
half2 __ovld __cnfn acospi(half2 x);
half3 __ovld __cnfn acospi(half3 x);
half4 __ovld __cnfn acospi(half4 x);
half8 __ovld __cnfn acospi(half8 x);
half16 __ovld __cnfn acospi(half16 x);
#endif //cl_khr_fp16
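/*
 * Illustrative note (not part of the header): the *pi variants fold the
 * division by PI into the builtin, so acospi(x) corresponds to
 * acos(x) / M_PI_F while typically being cheaper and at least as accurate:
 *
 *   float t = acospi(-1.0f);   // acos(-1)/PI, i.e. 1.0f up to rounding
 */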
/**
* Arc sine function.
*/
float __ovld __cnfn asin(float);
float2 __ovld __cnfn asin(float2);
float3 __ovld __cnfn asin(float3);
float4 __ovld __cnfn asin(float4);
float8 __ovld __cnfn asin(float8);
float16 __ovld __cnfn asin(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn asin(double);
double2 __ovld __cnfn asin(double2);
double3 __ovld __cnfn asin(double3);
double4 __ovld __cnfn asin(double4);
double8 __ovld __cnfn asin(double8);
double16 __ovld __cnfn asin(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn asin(half);
half2 __ovld __cnfn asin(half2);
half3 __ovld __cnfn asin(half3);
half4 __ovld __cnfn asin(half4);
half8 __ovld __cnfn asin(half8);
half16 __ovld __cnfn asin(half16);
#endif //cl_khr_fp16
/**
* Inverse hyperbolic sine.
*/
float __ovld __cnfn asinh(float);
float2 __ovld __cnfn asinh(float2);
float3 __ovld __cnfn asinh(float3);
float4 __ovld __cnfn asinh(float4);
float8 __ovld __cnfn asinh(float8);
float16 __ovld __cnfn asinh(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn asinh(double);
double2 __ovld __cnfn asinh(double2);
double3 __ovld __cnfn asinh(double3);
double4 __ovld __cnfn asinh(double4);
double8 __ovld __cnfn asinh(double8);
double16 __ovld __cnfn asinh(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn asinh(half);
half2 __ovld __cnfn asinh(half2);
half3 __ovld __cnfn asinh(half3);
half4 __ovld __cnfn asinh(half4);
half8 __ovld __cnfn asinh(half8);
half16 __ovld __cnfn asinh(half16);
#endif //cl_khr_fp16
/**
* Compute asin (x) / PI.
*/
float __ovld __cnfn asinpi(float x);
float2 __ovld __cnfn asinpi(float2 x);
float3 __ovld __cnfn asinpi(float3 x);
float4 __ovld __cnfn asinpi(float4 x);
float8 __ovld __cnfn asinpi(float8 x);
float16 __ovld __cnfn asinpi(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn asinpi(double x);
double2 __ovld __cnfn asinpi(double2 x);
double3 __ovld __cnfn asinpi(double3 x);
double4 __ovld __cnfn asinpi(double4 x);
double8 __ovld __cnfn asinpi(double8 x);
double16 __ovld __cnfn asinpi(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn asinpi(half x);
half2 __ovld __cnfn asinpi(half2 x);
half3 __ovld __cnfn asinpi(half3 x);
half4 __ovld __cnfn asinpi(half4 x);
half8 __ovld __cnfn asinpi(half8 x);
half16 __ovld __cnfn asinpi(half16 x);
#endif //cl_khr_fp16
/**
* Arc tangent function.
*/
float __ovld __cnfn atan(float y_over_x);
float2 __ovld __cnfn atan(float2 y_over_x);
float3 __ovld __cnfn atan(float3 y_over_x);
float4 __ovld __cnfn atan(float4 y_over_x);
float8 __ovld __cnfn atan(float8 y_over_x);
float16 __ovld __cnfn atan(float16 y_over_x);
#ifdef cl_khr_fp64
double __ovld __cnfn atan(double y_over_x);
double2 __ovld __cnfn atan(double2 y_over_x);
double3 __ovld __cnfn atan(double3 y_over_x);
double4 __ovld __cnfn atan(double4 y_over_x);
double8 __ovld __cnfn atan(double8 y_over_x);
double16 __ovld __cnfn atan(double16 y_over_x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn atan(half y_over_x);
half2 __ovld __cnfn atan(half2 y_over_x);
half3 __ovld __cnfn atan(half3 y_over_x);
half4 __ovld __cnfn atan(half4 y_over_x);
half8 __ovld __cnfn atan(half8 y_over_x);
half16 __ovld __cnfn atan(half16 y_over_x);
#endif //cl_khr_fp16
/**
* Arc tangent of y / x.
*/
float __ovld __cnfn atan2(float y, float x);
float2 __ovld __cnfn atan2(float2 y, float2 x);
float3 __ovld __cnfn atan2(float3 y, float3 x);
float4 __ovld __cnfn atan2(float4 y, float4 x);
float8 __ovld __cnfn atan2(float8 y, float8 x);
float16 __ovld __cnfn atan2(float16 y, float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn atan2(double y, double x);
double2 __ovld __cnfn atan2(double2 y, double2 x);
double3 __ovld __cnfn atan2(double3 y, double3 x);
double4 __ovld __cnfn atan2(double4 y, double4 x);
double8 __ovld __cnfn atan2(double8 y, double8 x);
double16 __ovld __cnfn atan2(double16 y, double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn atan2(half y, half x);
half2 __ovld __cnfn atan2(half2 y, half2 x);
half3 __ovld __cnfn atan2(half3 y, half3 x);
half4 __ovld __cnfn atan2(half4 y, half4 x);
half8 __ovld __cnfn atan2(half8 y, half8 x);
half16 __ovld __cnfn atan2(half16 y, half16 x);
#endif //cl_khr_fp16
/**
* Hyperbolic arc tangent.
*/
float __ovld __cnfn atanh(float);
float2 __ovld __cnfn atanh(float2);
float3 __ovld __cnfn atanh(float3);
float4 __ovld __cnfn atanh(float4);
float8 __ovld __cnfn atanh(float8);
float16 __ovld __cnfn atanh(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn atanh(double);
double2 __ovld __cnfn atanh(double2);
double3 __ovld __cnfn atanh(double3);
double4 __ovld __cnfn atanh(double4);
double8 __ovld __cnfn atanh(double8);
double16 __ovld __cnfn atanh(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn atanh(half);
half2 __ovld __cnfn atanh(half2);
half3 __ovld __cnfn atanh(half3);
half4 __ovld __cnfn atanh(half4);
half8 __ovld __cnfn atanh(half8);
half16 __ovld __cnfn atanh(half16);
#endif //cl_khr_fp16
/**
* Compute atan (x) / PI.
*/
float __ovld __cnfn atanpi(float x);
float2 __ovld __cnfn atanpi(float2 x);
float3 __ovld __cnfn atanpi(float3 x);
float4 __ovld __cnfn atanpi(float4 x);
float8 __ovld __cnfn atanpi(float8 x);
float16 __ovld __cnfn atanpi(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn atanpi(double x);
double2 __ovld __cnfn atanpi(double2 x);
double3 __ovld __cnfn atanpi(double3 x);
double4 __ovld __cnfn atanpi(double4 x);
double8 __ovld __cnfn atanpi(double8 x);
double16 __ovld __cnfn atanpi(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn atanpi(half x);
half2 __ovld __cnfn atanpi(half2 x);
half3 __ovld __cnfn atanpi(half3 x);
half4 __ovld __cnfn atanpi(half4 x);
half8 __ovld __cnfn atanpi(half8 x);
half16 __ovld __cnfn atanpi(half16 x);
#endif //cl_khr_fp16
/**
* Compute atan2 (y, x) / PI.
*/
float __ovld __cnfn atan2pi(float y, float x);
float2 __ovld __cnfn atan2pi(float2 y, float2 x);
float3 __ovld __cnfn atan2pi(float3 y, float3 x);
float4 __ovld __cnfn atan2pi(float4 y, float4 x);
float8 __ovld __cnfn atan2pi(float8 y, float8 x);
float16 __ovld __cnfn atan2pi(float16 y, float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn atan2pi(double y, double x);
double2 __ovld __cnfn atan2pi(double2 y, double2 x);
double3 __ovld __cnfn atan2pi(double3 y, double3 x);
double4 __ovld __cnfn atan2pi(double4 y, double4 x);
double8 __ovld __cnfn atan2pi(double8 y, double8 x);
double16 __ovld __cnfn atan2pi(double16 y, double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn atan2pi(half y, half x);
half2 __ovld __cnfn atan2pi(half2 y, half2 x);
half3 __ovld __cnfn atan2pi(half3 y, half3 x);
half4 __ovld __cnfn atan2pi(half4 y, half4 x);
half8 __ovld __cnfn atan2pi(half8 y, half8 x);
half16 __ovld __cnfn atan2pi(half16 y, half16 x);
#endif //cl_khr_fp16
/**
* Compute cube-root.
*/
float __ovld __cnfn cbrt(float);
float2 __ovld __cnfn cbrt(float2);
float3 __ovld __cnfn cbrt(float3);
float4 __ovld __cnfn cbrt(float4);
float8 __ovld __cnfn cbrt(float8);
float16 __ovld __cnfn cbrt(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn cbrt(double);
double2 __ovld __cnfn cbrt(double2);
double3 __ovld __cnfn cbrt(double3);
double4 __ovld __cnfn cbrt(double4);
double8 __ovld __cnfn cbrt(double8);
double16 __ovld __cnfn cbrt(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn cbrt(half);
half2 __ovld __cnfn cbrt(half2);
half3 __ovld __cnfn cbrt(half3);
half4 __ovld __cnfn cbrt(half4);
half8 __ovld __cnfn cbrt(half8);
half16 __ovld __cnfn cbrt(half16);
#endif //cl_khr_fp16
/**
* Round to integral value using the round to positive
* infinity rounding mode.
*/
float __ovld __cnfn ceil(float);
float2 __ovld __cnfn ceil(float2);
float3 __ovld __cnfn ceil(float3);
float4 __ovld __cnfn ceil(float4);
float8 __ovld __cnfn ceil(float8);
float16 __ovld __cnfn ceil(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn ceil(double);
double2 __ovld __cnfn ceil(double2);
double3 __ovld __cnfn ceil(double3);
double4 __ovld __cnfn ceil(double4);
double8 __ovld __cnfn ceil(double8);
double16 __ovld __cnfn ceil(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn ceil(half);
half2 __ovld __cnfn ceil(half2);
half3 __ovld __cnfn ceil(half3);
half4 __ovld __cnfn ceil(half4);
half8 __ovld __cnfn ceil(half8);
half16 __ovld __cnfn ceil(half16);
#endif //cl_khr_fp16
/**
* Returns x with its sign changed to match the sign of y.
*/
float __ovld __cnfn copysign(float x, float y);
float2 __ovld __cnfn copysign(float2 x, float2 y);
float3 __ovld __cnfn copysign(float3 x, float3 y);
float4 __ovld __cnfn copysign(float4 x, float4 y);
float8 __ovld __cnfn copysign(float8 x, float8 y);
float16 __ovld __cnfn copysign(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn copysign(double x, double y);
double2 __ovld __cnfn copysign(double2 x, double2 y);
double3 __ovld __cnfn copysign(double3 x, double3 y);
double4 __ovld __cnfn copysign(double4 x, double4 y);
double8 __ovld __cnfn copysign(double8 x, double8 y);
double16 __ovld __cnfn copysign(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn copysign(half x, half y);
half2 __ovld __cnfn copysign(half2 x, half2 y);
half3 __ovld __cnfn copysign(half3 x, half3 y);
half4 __ovld __cnfn copysign(half4 x, half4 y);
half8 __ovld __cnfn copysign(half8 x, half8 y);
half16 __ovld __cnfn copysign(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Compute cosine.
*/
float __ovld __cnfn cos(float);
float2 __ovld __cnfn cos(float2);
float3 __ovld __cnfn cos(float3);
float4 __ovld __cnfn cos(float4);
float8 __ovld __cnfn cos(float8);
float16 __ovld __cnfn cos(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn cos(double);
double2 __ovld __cnfn cos(double2);
double3 __ovld __cnfn cos(double3);
double4 __ovld __cnfn cos(double4);
double8 __ovld __cnfn cos(double8);
double16 __ovld __cnfn cos(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn cos(half);
half2 __ovld __cnfn cos(half2);
half3 __ovld __cnfn cos(half3);
half4 __ovld __cnfn cos(half4);
half8 __ovld __cnfn cos(half8);
half16 __ovld __cnfn cos(half16);
#endif //cl_khr_fp16
/**
* Compute hyperbolic cosine.
*/
float __ovld __cnfn cosh(float);
float2 __ovld __cnfn cosh(float2);
float3 __ovld __cnfn cosh(float3);
float4 __ovld __cnfn cosh(float4);
float8 __ovld __cnfn cosh(float8);
float16 __ovld __cnfn cosh(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn cosh(double);
double2 __ovld __cnfn cosh(double2);
double3 __ovld __cnfn cosh(double3);
double4 __ovld __cnfn cosh(double4);
double8 __ovld __cnfn cosh(double8);
double16 __ovld __cnfn cosh(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn cosh(half);
half2 __ovld __cnfn cosh(half2);
half3 __ovld __cnfn cosh(half3);
half4 __ovld __cnfn cosh(half4);
half8 __ovld __cnfn cosh(half8);
half16 __ovld __cnfn cosh(half16);
#endif //cl_khr_fp16
/**
* Compute cos (PI * x).
*/
float __ovld __cnfn cospi(float x);
float2 __ovld __cnfn cospi(float2 x);
float3 __ovld __cnfn cospi(float3 x);
float4 __ovld __cnfn cospi(float4 x);
float8 __ovld __cnfn cospi(float8 x);
float16 __ovld __cnfn cospi(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn cospi(double x);
double2 __ovld __cnfn cospi(double2 x);
double3 __ovld __cnfn cospi(double3 x);
double4 __ovld __cnfn cospi(double4 x);
double8 __ovld __cnfn cospi(double8 x);
double16 __ovld __cnfn cospi(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn cospi(half x);
half2 __ovld __cnfn cospi(half2 x);
half3 __ovld __cnfn cospi(half3 x);
half4 __ovld __cnfn cospi(half4 x);
half8 __ovld __cnfn cospi(half8 x);
half16 __ovld __cnfn cospi(half16 x);
#endif //cl_khr_fp16
/**
* Complementary error function.
*/
float __ovld __cnfn erfc(float);
float2 __ovld __cnfn erfc(float2);
float3 __ovld __cnfn erfc(float3);
float4 __ovld __cnfn erfc(float4);
float8 __ovld __cnfn erfc(float8);
float16 __ovld __cnfn erfc(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn erfc(double);
double2 __ovld __cnfn erfc(double2);
double3 __ovld __cnfn erfc(double3);
double4 __ovld __cnfn erfc(double4);
double8 __ovld __cnfn erfc(double8);
double16 __ovld __cnfn erfc(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn erfc(half);
half2 __ovld __cnfn erfc(half2);
half3 __ovld __cnfn erfc(half3);
half4 __ovld __cnfn erfc(half4);
half8 __ovld __cnfn erfc(half8);
half16 __ovld __cnfn erfc(half16);
#endif //cl_khr_fp16
/**
* Error function encountered in integrating the
* normal distribution.
*/
float __ovld __cnfn erf(float);
float2 __ovld __cnfn erf(float2);
float3 __ovld __cnfn erf(float3);
float4 __ovld __cnfn erf(float4);
float8 __ovld __cnfn erf(float8);
float16 __ovld __cnfn erf(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn erf(double);
double2 __ovld __cnfn erf(double2);
double3 __ovld __cnfn erf(double3);
double4 __ovld __cnfn erf(double4);
double8 __ovld __cnfn erf(double8);
double16 __ovld __cnfn erf(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn erf(half);
half2 __ovld __cnfn erf(half2);
half3 __ovld __cnfn erf(half3);
half4 __ovld __cnfn erf(half4);
half8 __ovld __cnfn erf(half8);
half16 __ovld __cnfn erf(half16);
#endif //cl_khr_fp16
/**
* Compute the base e exponential function of x.
*/
float __ovld __cnfn exp(float x);
float2 __ovld __cnfn exp(float2 x);
float3 __ovld __cnfn exp(float3 x);
float4 __ovld __cnfn exp(float4 x);
float8 __ovld __cnfn exp(float8 x);
float16 __ovld __cnfn exp(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn exp(double x);
double2 __ovld __cnfn exp(double2 x);
double3 __ovld __cnfn exp(double3 x);
double4 __ovld __cnfn exp(double4 x);
double8 __ovld __cnfn exp(double8 x);
double16 __ovld __cnfn exp(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn exp(half x);
half2 __ovld __cnfn exp(half2 x);
half3 __ovld __cnfn exp(half3 x);
half4 __ovld __cnfn exp(half4 x);
half8 __ovld __cnfn exp(half8 x);
half16 __ovld __cnfn exp(half16 x);
#endif //cl_khr_fp16
/**
* Exponential base 2 function.
*/
float __ovld __cnfn exp2(float);
float2 __ovld __cnfn exp2(float2);
float3 __ovld __cnfn exp2(float3);
float4 __ovld __cnfn exp2(float4);
float8 __ovld __cnfn exp2(float8);
float16 __ovld __cnfn exp2(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn exp2(double);
double2 __ovld __cnfn exp2(double2);
double3 __ovld __cnfn exp2(double3);
double4 __ovld __cnfn exp2(double4);
double8 __ovld __cnfn exp2(double8);
double16 __ovld __cnfn exp2(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn exp2(half);
half2 __ovld __cnfn exp2(half2);
half3 __ovld __cnfn exp2(half3);
half4 __ovld __cnfn exp2(half4);
half8 __ovld __cnfn exp2(half8);
half16 __ovld __cnfn exp2(half16);
#endif //cl_khr_fp16
/**
* Exponential base 10 function.
*/
float __ovld __cnfn exp10(float);
float2 __ovld __cnfn exp10(float2);
float3 __ovld __cnfn exp10(float3);
float4 __ovld __cnfn exp10(float4);
float8 __ovld __cnfn exp10(float8);
float16 __ovld __cnfn exp10(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn exp10(double);
double2 __ovld __cnfn exp10(double2);
double3 __ovld __cnfn exp10(double3);
double4 __ovld __cnfn exp10(double4);
double8 __ovld __cnfn exp10(double8);
double16 __ovld __cnfn exp10(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn exp10(half);
half2 __ovld __cnfn exp10(half2);
half3 __ovld __cnfn exp10(half3);
half4 __ovld __cnfn exp10(half4);
half8 __ovld __cnfn exp10(half8);
half16 __ovld __cnfn exp10(half16);
#endif //cl_khr_fp16
/**
 * Compute e^x - 1.0.
*/
float __ovld __cnfn expm1(float x);
float2 __ovld __cnfn expm1(float2 x);
float3 __ovld __cnfn expm1(float3 x);
float4 __ovld __cnfn expm1(float4 x);
float8 __ovld __cnfn expm1(float8 x);
float16 __ovld __cnfn expm1(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn expm1(double x);
double2 __ovld __cnfn expm1(double2 x);
double3 __ovld __cnfn expm1(double3 x);
double4 __ovld __cnfn expm1(double4 x);
double8 __ovld __cnfn expm1(double8 x);
double16 __ovld __cnfn expm1(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn expm1(half x);
half2 __ovld __cnfn expm1(half2 x);
half3 __ovld __cnfn expm1(half3 x);
half4 __ovld __cnfn expm1(half4 x);
half8 __ovld __cnfn expm1(half8 x);
half16 __ovld __cnfn expm1(half16 x);
#endif //cl_khr_fp16
/**
* Compute absolute value of a floating-point number.
*/
float __ovld __cnfn fabs(float);
float2 __ovld __cnfn fabs(float2);
float3 __ovld __cnfn fabs(float3);
float4 __ovld __cnfn fabs(float4);
float8 __ovld __cnfn fabs(float8);
float16 __ovld __cnfn fabs(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn fabs(double);
double2 __ovld __cnfn fabs(double2);
double3 __ovld __cnfn fabs(double3);
double4 __ovld __cnfn fabs(double4);
double8 __ovld __cnfn fabs(double8);
double16 __ovld __cnfn fabs(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn fabs(half);
half2 __ovld __cnfn fabs(half2);
half3 __ovld __cnfn fabs(half3);
half4 __ovld __cnfn fabs(half4);
half8 __ovld __cnfn fabs(half8);
half16 __ovld __cnfn fabs(half16);
#endif //cl_khr_fp16
/**
* x - y if x > y, +0 if x is less than or equal to y.
*/
float __ovld __cnfn fdim(float x, float y);
float2 __ovld __cnfn fdim(float2 x, float2 y);
float3 __ovld __cnfn fdim(float3 x, float3 y);
float4 __ovld __cnfn fdim(float4 x, float4 y);
float8 __ovld __cnfn fdim(float8 x, float8 y);
float16 __ovld __cnfn fdim(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn fdim(double x, double y);
double2 __ovld __cnfn fdim(double2 x, double2 y);
double3 __ovld __cnfn fdim(double3 x, double3 y);
double4 __ovld __cnfn fdim(double4 x, double4 y);
double8 __ovld __cnfn fdim(double8 x, double8 y);
double16 __ovld __cnfn fdim(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn fdim(half x, half y);
half2 __ovld __cnfn fdim(half2 x, half2 y);
half3 __ovld __cnfn fdim(half3 x, half3 y);
half4 __ovld __cnfn fdim(half4 x, half4 y);
half8 __ovld __cnfn fdim(half8 x, half8 y);
half16 __ovld __cnfn fdim(half16 x, half16 y);
#endif //cl_khr_fp16
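/*
 * Illustrative note (not part of the header): fdim() is the positive
 * difference of its arguments, for example:
 *
 *   fdim(5.0f, 3.0f) ==  2.0f   // x > y:  returns x - y
 *   fdim(3.0f, 5.0f) == +0.0f   // x <= y: returns +0
 */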
/**
 * Round to integral value using the round to negative
* infinity rounding mode.
*/
float __ovld __cnfn floor(float);
float2 __ovld __cnfn floor(float2);
float3 __ovld __cnfn floor(float3);
float4 __ovld __cnfn floor(float4);
float8 __ovld __cnfn floor(float8);
float16 __ovld __cnfn floor(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn floor(double);
double2 __ovld __cnfn floor(double2);
double3 __ovld __cnfn floor(double3);
double4 __ovld __cnfn floor(double4);
double8 __ovld __cnfn floor(double8);
double16 __ovld __cnfn floor(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn floor(half);
half2 __ovld __cnfn floor(half2);
half3 __ovld __cnfn floor(half3);
half4 __ovld __cnfn floor(half4);
half8 __ovld __cnfn floor(half8);
half16 __ovld __cnfn floor(half16);
#endif //cl_khr_fp16
/**
* Returns the correctly rounded floating-point
* representation of the sum of c with the infinitely
* precise product of a and b. Rounding of
* intermediate products shall not occur. Edge case
* behavior is per the IEEE 754-2008 standard.
*/
float __ovld __cnfn fma(float a, float b, float c);
float2 __ovld __cnfn fma(float2 a, float2 b, float2 c);
float3 __ovld __cnfn fma(float3 a, float3 b, float3 c);
float4 __ovld __cnfn fma(float4 a, float4 b, float4 c);
float8 __ovld __cnfn fma(float8 a, float8 b, float8 c);
float16 __ovld __cnfn fma(float16 a, float16 b, float16 c);
#ifdef cl_khr_fp64
double __ovld __cnfn fma(double a, double b, double c);
double2 __ovld __cnfn fma(double2 a, double2 b, double2 c);
double3 __ovld __cnfn fma(double3 a, double3 b, double3 c);
double4 __ovld __cnfn fma(double4 a, double4 b, double4 c);
double8 __ovld __cnfn fma(double8 a, double8 b, double8 c);
double16 __ovld __cnfn fma(double16 a, double16 b, double16 c);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn fma(half a, half b, half c);
half2 __ovld __cnfn fma(half2 a, half2 b, half2 c);
half3 __ovld __cnfn fma(half3 a, half3 b, half3 c);
half4 __ovld __cnfn fma(half4 a, half4 b, half4 c);
half8 __ovld __cnfn fma(half8 a, half8 b, half8 c);
half16 __ovld __cnfn fma(half16 a, half16 b, half16 c);
#endif //cl_khr_fp16
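/*
 * Illustrative note (not part of the header): because fma() rounds only
 * once, it can preserve low-order bits that a separately rounded multiply
 * followed by an add would discard (values below are a sketch):
 *
 *   float a = 1.0f + 0x1.0p-12f;
 *   float two_step = a * a - 1.0f;      // product rounded first; may lose the 2^-24 term
 *   float fused    = fma(a, a, -1.0f);  // single rounding keeps the 2^-24 term
 */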
/**
* Returns y if x < y, otherwise it returns x. If one
* argument is a NaN, fmax() returns the other
* argument. If both arguments are NaNs, fmax()
* returns a NaN.
*/
float __ovld __cnfn fmax(float x, float y);
float2 __ovld __cnfn fmax(float2 x, float2 y);
float3 __ovld __cnfn fmax(float3 x, float3 y);
float4 __ovld __cnfn fmax(float4 x, float4 y);
float8 __ovld __cnfn fmax(float8 x, float8 y);
float16 __ovld __cnfn fmax(float16 x, float16 y);
float2 __ovld __cnfn fmax(float2 x, float y);
float3 __ovld __cnfn fmax(float3 x, float y);
float4 __ovld __cnfn fmax(float4 x, float y);
float8 __ovld __cnfn fmax(float8 x, float y);
float16 __ovld __cnfn fmax(float16 x, float y);
#ifdef cl_khr_fp64
double __ovld __cnfn fmax(double x, double y);
double2 __ovld __cnfn fmax(double2 x, double2 y);
double3 __ovld __cnfn fmax(double3 x, double3 y);
double4 __ovld __cnfn fmax(double4 x, double4 y);
double8 __ovld __cnfn fmax(double8 x, double8 y);
double16 __ovld __cnfn fmax(double16 x, double16 y);
double2 __ovld __cnfn fmax(double2 x, double y);
double3 __ovld __cnfn fmax(double3 x, double y);
double4 __ovld __cnfn fmax(double4 x, double y);
double8 __ovld __cnfn fmax(double8 x, double y);
double16 __ovld __cnfn fmax(double16 x, double y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn fmax(half x, half y);
half2 __ovld __cnfn fmax(half2 x, half2 y);
half3 __ovld __cnfn fmax(half3 x, half3 y);
half4 __ovld __cnfn fmax(half4 x, half4 y);
half8 __ovld __cnfn fmax(half8 x, half8 y);
half16 __ovld __cnfn fmax(half16 x, half16 y);
half2 __ovld __cnfn fmax(half2 x, half y);
half3 __ovld __cnfn fmax(half3 x, half y);
half4 __ovld __cnfn fmax(half4 x, half y);
half8 __ovld __cnfn fmax(half8 x, half y);
half16 __ovld __cnfn fmax(half16 x, half y);
#endif //cl_khr_fp16
/**
* Returns y if y < x, otherwise it returns x. If one
* argument is a NaN, fmin() returns the other
* argument. If both arguments are NaNs, fmin()
* returns a NaN.
*/
float __ovld __cnfn fmin(float x, float y);
float2 __ovld __cnfn fmin(float2 x, float2 y);
float3 __ovld __cnfn fmin(float3 x, float3 y);
float4 __ovld __cnfn fmin(float4 x, float4 y);
float8 __ovld __cnfn fmin(float8 x, float8 y);
float16 __ovld __cnfn fmin(float16 x, float16 y);
float2 __ovld __cnfn fmin(float2 x, float y);
float3 __ovld __cnfn fmin(float3 x, float y);
float4 __ovld __cnfn fmin(float4 x, float y);
float8 __ovld __cnfn fmin(float8 x, float y);
float16 __ovld __cnfn fmin(float16 x, float y);
#ifdef cl_khr_fp64
double __ovld __cnfn fmin(double x, double y);
double2 __ovld __cnfn fmin(double2 x, double2 y);
double3 __ovld __cnfn fmin(double3 x, double3 y);
double4 __ovld __cnfn fmin(double4 x, double4 y);
double8 __ovld __cnfn fmin(double8 x, double8 y);
double16 __ovld __cnfn fmin(double16 x, double16 y);
double2 __ovld __cnfn fmin(double2 x, double y);
double3 __ovld __cnfn fmin(double3 x, double y);
double4 __ovld __cnfn fmin(double4 x, double y);
double8 __ovld __cnfn fmin(double8 x, double y);
double16 __ovld __cnfn fmin(double16 x, double y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn fmin(half x, half y);
half2 __ovld __cnfn fmin(half2 x, half2 y);
half3 __ovld __cnfn fmin(half3 x, half3 y);
half4 __ovld __cnfn fmin(half4 x, half4 y);
half8 __ovld __cnfn fmin(half8 x, half8 y);
half16 __ovld __cnfn fmin(half16 x, half16 y);
half2 __ovld __cnfn fmin(half2 x, half y);
half3 __ovld __cnfn fmin(half3 x, half y);
half4 __ovld __cnfn fmin(half4 x, half y);
half8 __ovld __cnfn fmin(half8 x, half y);
half16 __ovld __cnfn fmin(half16 x, half y);
#endif //cl_khr_fp16
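/*
 * Illustrative note (not part of the header): unlike the < and > operators,
 * fmin() and fmax() treat a single NaN operand as missing data and return
 * the other operand; the mixed vector/scalar overloads apply the scalar to
 * every component, e.g.
 *
 *   float4 v = (float4)(-1.0f, 0.5f, nan(0u), 3.0f);
 *   float4 lo = fmax(v, 0.0f);   // (0.0f, 0.5f, 0.0f, 3.0f)
 */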
/**
* Modulus. Returns x - y * trunc (x/y).
*/
float __ovld __cnfn fmod(float x, float y);
float2 __ovld __cnfn fmod(float2 x, float2 y);
float3 __ovld __cnfn fmod(float3 x, float3 y);
float4 __ovld __cnfn fmod(float4 x, float4 y);
float8 __ovld __cnfn fmod(float8 x, float8 y);
float16 __ovld __cnfn fmod(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn fmod(double x, double y);
double2 __ovld __cnfn fmod(double2 x, double2 y);
double3 __ovld __cnfn fmod(double3 x, double3 y);
double4 __ovld __cnfn fmod(double4 x, double4 y);
double8 __ovld __cnfn fmod(double8 x, double8 y);
double16 __ovld __cnfn fmod(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn fmod(half x, half y);
half2 __ovld __cnfn fmod(half2 x, half2 y);
half3 __ovld __cnfn fmod(half3 x, half3 y);
half4 __ovld __cnfn fmod(half4 x, half4 y);
half8 __ovld __cnfn fmod(half8 x, half8 y);
half16 __ovld __cnfn fmod(half16 x, half16 y);
#endif //cl_khr_fp16
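/*
 * Illustrative note (not part of the header): fmod() truncates the quotient
 * toward zero, so the result keeps the sign of x, e.g.
 *
 *   fmod( 5.5f, 2.0f) ==  1.5f   //  5.5 - 2.0 * trunc( 2.75) =  5.5 - 4.0
 *   fmod(-5.5f, 2.0f) == -1.5f   // -5.5 - 2.0 * trunc(-2.75) = -5.5 + 4.0
 */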
/**
 * Returns fmin(x - floor (x), 0x1.fffffep-1f).
* floor(x) is returned in iptr.
*/
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float __ovld fract(float x, float *iptr);
float2 __ovld fract(float2 x, float2 *iptr);
float3 __ovld fract(float3 x, float3 *iptr);
float4 __ovld fract(float4 x, float4 *iptr);
float8 __ovld fract(float8 x, float8 *iptr);
float16 __ovld fract(float16 x, float16 *iptr);
#ifdef cl_khr_fp64
double __ovld fract(double x, double *iptr);
double2 __ovld fract(double2 x, double2 *iptr);
double3 __ovld fract(double3 x, double3 *iptr);
double4 __ovld fract(double4 x, double4 *iptr);
double8 __ovld fract(double8 x, double8 *iptr);
double16 __ovld fract(double16 x, double16 *iptr);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld fract(half x, half *iptr);
half2 __ovld fract(half2 x, half2 *iptr);
half3 __ovld fract(half3 x, half3 *iptr);
half4 __ovld fract(half4 x, half4 *iptr);
half8 __ovld fract(half8 x, half8 *iptr);
half16 __ovld fract(half16 x, half16 *iptr);
#endif //cl_khr_fp16
#else
float __ovld fract(float x, __global float *iptr);
float2 __ovld fract(float2 x, __global float2 *iptr);
float3 __ovld fract(float3 x, __global float3 *iptr);
float4 __ovld fract(float4 x, __global float4 *iptr);
float8 __ovld fract(float8 x, __global float8 *iptr);
float16 __ovld fract(float16 x, __global float16 *iptr);
float __ovld fract(float x, __local float *iptr);
float2 __ovld fract(float2 x, __local float2 *iptr);
float3 __ovld fract(float3 x, __local float3 *iptr);
float4 __ovld fract(float4 x, __local float4 *iptr);
float8 __ovld fract(float8 x, __local float8 *iptr);
float16 __ovld fract(float16 x, __local float16 *iptr);
float __ovld fract(float x, __private float *iptr);
float2 __ovld fract(float2 x, __private float2 *iptr);
float3 __ovld fract(float3 x, __private float3 *iptr);
float4 __ovld fract(float4 x, __private float4 *iptr);
float8 __ovld fract(float8 x, __private float8 *iptr);
float16 __ovld fract(float16 x, __private float16 *iptr);
#ifdef cl_khr_fp64
double __ovld fract(double x, __global double *iptr);
double2 __ovld fract(double2 x, __global double2 *iptr);
double3 __ovld fract(double3 x, __global double3 *iptr);
double4 __ovld fract(double4 x, __global double4 *iptr);
double8 __ovld fract(double8 x, __global double8 *iptr);
double16 __ovld fract(double16 x, __global double16 *iptr);
double __ovld fract(double x, __local double *iptr);
double2 __ovld fract(double2 x, __local double2 *iptr);
double3 __ovld fract(double3 x, __local double3 *iptr);
double4 __ovld fract(double4 x, __local double4 *iptr);
double8 __ovld fract(double8 x, __local double8 *iptr);
double16 __ovld fract(double16 x, __local double16 *iptr);
double __ovld fract(double x, __private double *iptr);
double2 __ovld fract(double2 x, __private double2 *iptr);
double3 __ovld fract(double3 x, __private double3 *iptr);
double4 __ovld fract(double4 x, __private double4 *iptr);
double8 __ovld fract(double8 x, __private double8 *iptr);
double16 __ovld fract(double16 x, __private double16 *iptr);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld fract(half x, __global half *iptr);
half2 __ovld fract(half2 x, __global half2 *iptr);
half3 __ovld fract(half3 x, __global half3 *iptr);
half4 __ovld fract(half4 x, __global half4 *iptr);
half8 __ovld fract(half8 x, __global half8 *iptr);
half16 __ovld fract(half16 x, __global half16 *iptr);
half __ovld fract(half x, __local half *iptr);
half2 __ovld fract(half2 x, __local half2 *iptr);
half3 __ovld fract(half3 x, __local half3 *iptr);
half4 __ovld fract(half4 x, __local half4 *iptr);
half8 __ovld fract(half8 x, __local half8 *iptr);
half16 __ovld fract(half16 x, __local half16 *iptr);
half __ovld fract(half x, __private half *iptr);
half2 __ovld fract(half2 x, __private half2 *iptr);
half3 __ovld fract(half3 x, __private half3 *iptr);
half4 __ovld fract(half4 x, __private half4 *iptr);
half8 __ovld fract(half8 x, __private half8 *iptr);
half16 __ovld fract(half16 x, __private half16 *iptr);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
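/*
* Illustrative usage sketch (editor's addition, not part of the standard
* header): fract() yields the fractional and the floor part in one call.
* The kernel and buffer names below are hypothetical.
*
*   __kernel void split_parts(__global const float *in,
*                             __global float *flr, __global float *frc) {
*     size_t i = get_global_id(0);
*     float ip;                    // receives floor(in[i])
*     frc[i] = fract(in[i], &ip);  // fractional part, always < 1.0f
*     flr[i] = ip;                 // integral (floor) part
*   }
*/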
/**
* Extract mantissa and exponent from x. For each
* component the mantissa returned is a float with
* magnitude in the interval [1/2, 1) or 0. Each
* component of x equals the returned mantissa * 2^exp.
*/
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float __ovld frexp(float x, int *exp);
float2 __ovld frexp(float2 x, int2 *exp);
float3 __ovld frexp(float3 x, int3 *exp);
float4 __ovld frexp(float4 x, int4 *exp);
float8 __ovld frexp(float8 x, int8 *exp);
float16 __ovld frexp(float16 x, int16 *exp);
#ifdef cl_khr_fp64
double __ovld frexp(double x, int *exp);
double2 __ovld frexp(double2 x, int2 *exp);
double3 __ovld frexp(double3 x, int3 *exp);
double4 __ovld frexp(double4 x, int4 *exp);
double8 __ovld frexp(double8 x, int8 *exp);
double16 __ovld frexp(double16 x, int16 *exp);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld frexp(half x, int *exp);
half2 __ovld frexp(half2 x, int2 *exp);
half3 __ovld frexp(half3 x, int3 *exp);
half4 __ovld frexp(half4 x, int4 *exp);
half8 __ovld frexp(half8 x, int8 *exp);
half16 __ovld frexp(half16 x, int16 *exp);
#endif //cl_khr_fp16
#else
float __ovld frexp(float x, __global int *exp);
float2 __ovld frexp(float2 x, __global int2 *exp);
float3 __ovld frexp(float3 x, __global int3 *exp);
float4 __ovld frexp(float4 x, __global int4 *exp);
float8 __ovld frexp(float8 x, __global int8 *exp);
float16 __ovld frexp(float16 x, __global int16 *exp);
float __ovld frexp(float x, __local int *exp);
float2 __ovld frexp(float2 x, __local int2 *exp);
float3 __ovld frexp(float3 x, __local int3 *exp);
float4 __ovld frexp(float4 x, __local int4 *exp);
float8 __ovld frexp(float8 x, __local int8 *exp);
float16 __ovld frexp(float16 x, __local int16 *exp);
float __ovld frexp(float x, __private int *exp);
float2 __ovld frexp(float2 x, __private int2 *exp);
float3 __ovld frexp(float3 x, __private int3 *exp);
float4 __ovld frexp(float4 x, __private int4 *exp);
float8 __ovld frexp(float8 x, __private int8 *exp);
float16 __ovld frexp(float16 x, __private int16 *exp);
#ifdef cl_khr_fp64
double __ovld frexp(double x, __global int *exp);
double2 __ovld frexp(double2 x, __global int2 *exp);
double3 __ovld frexp(double3 x, __global int3 *exp);
double4 __ovld frexp(double4 x, __global int4 *exp);
double8 __ovld frexp(double8 x, __global int8 *exp);
double16 __ovld frexp(double16 x, __global int16 *exp);
double __ovld frexp(double x, __local int *exp);
double2 __ovld frexp(double2 x, __local int2 *exp);
double3 __ovld frexp(double3 x, __local int3 *exp);
double4 __ovld frexp(double4 x, __local int4 *exp);
double8 __ovld frexp(double8 x, __local int8 *exp);
double16 __ovld frexp(double16 x, __local int16 *exp);
double __ovld frexp(double x, __private int *exp);
double2 __ovld frexp(double2 x, __private int2 *exp);
double3 __ovld frexp(double3 x, __private int3 *exp);
double4 __ovld frexp(double4 x, __private int4 *exp);
double8 __ovld frexp(double8 x, __private int8 *exp);
double16 __ovld frexp(double16 x, __private int16 *exp);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld frexp(half x, __global int *exp);
half2 __ovld frexp(half2 x, __global int2 *exp);
half3 __ovld frexp(half3 x, __global int3 *exp);
half4 __ovld frexp(half4 x, __global int4 *exp);
half8 __ovld frexp(half8 x, __global int8 *exp);
half16 __ovld frexp(half16 x, __global int16 *exp);
half __ovld frexp(half x, __local int *exp);
half2 __ovld frexp(half2 x, __local int2 *exp);
half3 __ovld frexp(half3 x, __local int3 *exp);
half4 __ovld frexp(half4 x, __local int4 *exp);
half8 __ovld frexp(half8 x, __local int8 *exp);
half16 __ovld frexp(half16 x, __local int16 *exp);
half __ovld frexp(half x, __private int *exp);
half2 __ovld frexp(half2 x, __private int2 *exp);
half3 __ovld frexp(half3 x, __private int3 *exp);
half4 __ovld frexp(half4 x, __private int4 *exp);
half8 __ovld frexp(half8 x, __private int8 *exp);
half16 __ovld frexp(half16 x, __private int16 *exp);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
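/*
* Worked example (editor's addition, not part of the standard header):
* frexp() splits a value into a normalized mantissa and a power-of-two
* exponent. Variable names are hypothetical.
*
*   int e;
*   float m = frexp(48.0f, &e);  // m == 0.75f, e == 6, since 0.75 * 2^6 == 48
*/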
/**
* Compute the value of the square root of x^2 + y^2
* without undue overflow or underflow.
*/
float __ovld __cnfn hypot(float x, float y);
float2 __ovld __cnfn hypot(float2 x, float2 y);
float3 __ovld __cnfn hypot(float3 x, float3 y);
float4 __ovld __cnfn hypot(float4 x, float4 y);
float8 __ovld __cnfn hypot(float8 x, float8 y);
float16 __ovld __cnfn hypot(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn hypot(double x, double y);
double2 __ovld __cnfn hypot(double2 x, double2 y);
double3 __ovld __cnfn hypot(double3 x, double3 y);
double4 __ovld __cnfn hypot(double4 x, double4 y);
double8 __ovld __cnfn hypot(double8 x, double8 y);
double16 __ovld __cnfn hypot(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn hypot(half x, half y);
half2 __ovld __cnfn hypot(half2 x, half2 y);
half3 __ovld __cnfn hypot(half3 x, half3 y);
half4 __ovld __cnfn hypot(half4 x, half4 y);
half8 __ovld __cnfn hypot(half8 x, half8 y);
half16 __ovld __cnfn hypot(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Return the exponent as an integer value.
*/
int __ovld __cnfn ilogb(float x);
int2 __ovld __cnfn ilogb(float2 x);
int3 __ovld __cnfn ilogb(float3 x);
int4 __ovld __cnfn ilogb(float4 x);
int8 __ovld __cnfn ilogb(float8 x);
int16 __ovld __cnfn ilogb(float16 x);
#ifdef cl_khr_fp64
int __ovld __cnfn ilogb(double x);
int2 __ovld __cnfn ilogb(double2 x);
int3 __ovld __cnfn ilogb(double3 x);
int4 __ovld __cnfn ilogb(double4 x);
int8 __ovld __cnfn ilogb(double8 x);
int16 __ovld __cnfn ilogb(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn ilogb(half x);
int2 __ovld __cnfn ilogb(half2 x);
int3 __ovld __cnfn ilogb(half3 x);
int4 __ovld __cnfn ilogb(half4 x);
int8 __ovld __cnfn ilogb(half8 x);
int16 __ovld __cnfn ilogb(half16 x);
#endif //cl_khr_fp16
/**
* Multiply x by 2 to the power n.
*/
float __ovld __cnfn ldexp(float x, int n);
float2 __ovld __cnfn ldexp(float2 x, int2 n);
float3 __ovld __cnfn ldexp(float3 x, int3 n);
float4 __ovld __cnfn ldexp(float4 x, int4 n);
float8 __ovld __cnfn ldexp(float8 x, int8 n);
float16 __ovld __cnfn ldexp(float16 x, int16 n);
float2 __ovld __cnfn ldexp(float2 x, int n);
float3 __ovld __cnfn ldexp(float3 x, int n);
float4 __ovld __cnfn ldexp(float4 x, int n);
float8 __ovld __cnfn ldexp(float8 x, int n);
float16 __ovld __cnfn ldexp(float16 x, int n);
#ifdef cl_khr_fp64
double __ovld __cnfn ldexp(double x, int n);
double2 __ovld __cnfn ldexp(double2 x, int2 n);
double3 __ovld __cnfn ldexp(double3 x, int3 n);
double4 __ovld __cnfn ldexp(double4 x, int4 n);
double8 __ovld __cnfn ldexp(double8 x, int8 n);
double16 __ovld __cnfn ldexp(double16 x, int16 n);
double2 __ovld __cnfn ldexp(double2 x, int n);
double3 __ovld __cnfn ldexp(double3 x, int n);
double4 __ovld __cnfn ldexp(double4 x, int n);
double8 __ovld __cnfn ldexp(double8 x, int n);
double16 __ovld __cnfn ldexp(double16 x, int n);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn ldexp(half x, int n);
half2 __ovld __cnfn ldexp(half2 x, int2 n);
half3 __ovld __cnfn ldexp(half3 x, int3 n);
half4 __ovld __cnfn ldexp(half4 x, int4 n);
half8 __ovld __cnfn ldexp(half8 x, int8 n);
half16 __ovld __cnfn ldexp(half16 x, int16 n);
half2 __ovld __cnfn ldexp(half2 x, int n);
half3 __ovld __cnfn ldexp(half3 x, int n);
half4 __ovld __cnfn ldexp(half4 x, int n);
half8 __ovld __cnfn ldexp(half8 x, int n);
half16 __ovld __cnfn ldexp(half16 x, int n);
#endif //cl_khr_fp16
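/*
* Illustrative sketch (editor's addition): ldexp() scales by an exact power
* of two and inverts frexp(). Variable names are hypothetical.
*
*   int e;
*   float m = frexp(x, &e);  // x == m * 2^e
*   float y = ldexp(m, e);   // reconstructs x exactly
*   float h = ldexp(x, -1);  // exact scaling by 2^-1 (divide by 2)
*/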
/**
* Log gamma function. Returns the natural
* logarithm of the absolute value of the gamma
* function. The sign of the gamma function is
* returned in the signp argument of lgamma_r.
*/
float __ovld __cnfn lgamma(float x);
float2 __ovld __cnfn lgamma(float2 x);
float3 __ovld __cnfn lgamma(float3 x);
float4 __ovld __cnfn lgamma(float4 x);
float8 __ovld __cnfn lgamma(float8 x);
float16 __ovld __cnfn lgamma(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn lgamma(double x);
double2 __ovld __cnfn lgamma(double2 x);
double3 __ovld __cnfn lgamma(double3 x);
double4 __ovld __cnfn lgamma(double4 x);
double8 __ovld __cnfn lgamma(double8 x);
double16 __ovld __cnfn lgamma(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn lgamma(half x);
half2 __ovld __cnfn lgamma(half2 x);
half3 __ovld __cnfn lgamma(half3 x);
half4 __ovld __cnfn lgamma(half4 x);
half8 __ovld __cnfn lgamma(half8 x);
half16 __ovld __cnfn lgamma(half16 x);
#endif //cl_khr_fp16
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float __ovld lgamma_r(float x, int *signp);
float2 __ovld lgamma_r(float2 x, int2 *signp);
float3 __ovld lgamma_r(float3 x, int3 *signp);
float4 __ovld lgamma_r(float4 x, int4 *signp);
float8 __ovld lgamma_r(float8 x, int8 *signp);
float16 __ovld lgamma_r(float16 x, int16 *signp);
#ifdef cl_khr_fp64
double __ovld lgamma_r(double x, int *signp);
double2 __ovld lgamma_r(double2 x, int2 *signp);
double3 __ovld lgamma_r(double3 x, int3 *signp);
double4 __ovld lgamma_r(double4 x, int4 *signp);
double8 __ovld lgamma_r(double8 x, int8 *signp);
double16 __ovld lgamma_r(double16 x, int16 *signp);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld lgamma_r(half x, int *signp);
half2 __ovld lgamma_r(half2 x, int2 *signp);
half3 __ovld lgamma_r(half3 x, int3 *signp);
half4 __ovld lgamma_r(half4 x, int4 *signp);
half8 __ovld lgamma_r(half8 x, int8 *signp);
half16 __ovld lgamma_r(half16 x, int16 *signp);
#endif //cl_khr_fp16
#else
float __ovld lgamma_r(float x, __global int *signp);
float2 __ovld lgamma_r(float2 x, __global int2 *signp);
float3 __ovld lgamma_r(float3 x, __global int3 *signp);
float4 __ovld lgamma_r(float4 x, __global int4 *signp);
float8 __ovld lgamma_r(float8 x, __global int8 *signp);
float16 __ovld lgamma_r(float16 x, __global int16 *signp);
float __ovld lgamma_r(float x, __local int *signp);
float2 __ovld lgamma_r(float2 x, __local int2 *signp);
float3 __ovld lgamma_r(float3 x, __local int3 *signp);
float4 __ovld lgamma_r(float4 x, __local int4 *signp);
float8 __ovld lgamma_r(float8 x, __local int8 *signp);
float16 __ovld lgamma_r(float16 x, __local int16 *signp);
float __ovld lgamma_r(float x, __private int *signp);
float2 __ovld lgamma_r(float2 x, __private int2 *signp);
float3 __ovld lgamma_r(float3 x, __private int3 *signp);
float4 __ovld lgamma_r(float4 x, __private int4 *signp);
float8 __ovld lgamma_r(float8 x, __private int8 *signp);
float16 __ovld lgamma_r(float16 x, __private int16 *signp);
#ifdef cl_khr_fp64
double __ovld lgamma_r(double x, __global int *signp);
double2 __ovld lgamma_r(double2 x, __global int2 *signp);
double3 __ovld lgamma_r(double3 x, __global int3 *signp);
double4 __ovld lgamma_r(double4 x, __global int4 *signp);
double8 __ovld lgamma_r(double8 x, __global int8 *signp);
double16 __ovld lgamma_r(double16 x, __global int16 *signp);
double __ovld lgamma_r(double x, __local int *signp);
double2 __ovld lgamma_r(double2 x, __local int2 *signp);
double3 __ovld lgamma_r(double3 x, __local int3 *signp);
double4 __ovld lgamma_r(double4 x, __local int4 *signp);
double8 __ovld lgamma_r(double8 x, __local int8 *signp);
double16 __ovld lgamma_r(double16 x, __local int16 *signp);
double __ovld lgamma_r(double x, __private int *signp);
double2 __ovld lgamma_r(double2 x, __private int2 *signp);
double3 __ovld lgamma_r(double3 x, __private int3 *signp);
double4 __ovld lgamma_r(double4 x, __private int4 *signp);
double8 __ovld lgamma_r(double8 x, __private int8 *signp);
double16 __ovld lgamma_r(double16 x, __private int16 *signp);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld lgamma_r(half x, __global int *signp);
half2 __ovld lgamma_r(half2 x, __global int2 *signp);
half3 __ovld lgamma_r(half3 x, __global int3 *signp);
half4 __ovld lgamma_r(half4 x, __global int4 *signp);
half8 __ovld lgamma_r(half8 x, __global int8 *signp);
half16 __ovld lgamma_r(half16 x, __global int16 *signp);
half __ovld lgamma_r(half x, __local int *signp);
half2 __ovld lgamma_r(half2 x, __local int2 *signp);
half3 __ovld lgamma_r(half3 x, __local int3 *signp);
half4 __ovld lgamma_r(half4 x, __local int4 *signp);
half8 __ovld lgamma_r(half8 x, __local int8 *signp);
half16 __ovld lgamma_r(half16 x, __local int16 *signp);
half __ovld lgamma_r(half x, __private int *signp);
half2 __ovld lgamma_r(half2 x, __private int2 *signp);
half3 __ovld lgamma_r(half3 x, __private int3 *signp);
half4 __ovld lgamma_r(half4 x, __private int4 *signp);
half8 __ovld lgamma_r(half8 x, __private int8 *signp);
half16 __ovld lgamma_r(half16 x, __private int16 *signp);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
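/*
* Illustrative sketch (editor's addition): lgamma_r() lets the signed gamma
* function be recovered without computing gamma(x) directly. Variable names
* are hypothetical; exp(lg) must still be representable.
*
*   int sgn;
*   float lg = lgamma_r(x, &sgn);    // ln|gamma(x)|, sign of gamma(x) in sgn
*   float g  = (float)sgn * exp(lg); // gamma(x), while exp(lg) does not overflow
*/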
/**
* Compute natural logarithm.
*/
float __ovld __cnfn log(float);
float2 __ovld __cnfn log(float2);
float3 __ovld __cnfn log(float3);
float4 __ovld __cnfn log(float4);
float8 __ovld __cnfn log(float8);
float16 __ovld __cnfn log(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn log(double);
double2 __ovld __cnfn log(double2);
double3 __ovld __cnfn log(double3);
double4 __ovld __cnfn log(double4);
double8 __ovld __cnfn log(double8);
double16 __ovld __cnfn log(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn log(half);
half2 __ovld __cnfn log(half2);
half3 __ovld __cnfn log(half3);
half4 __ovld __cnfn log(half4);
half8 __ovld __cnfn log(half8);
half16 __ovld __cnfn log(half16);
#endif //cl_khr_fp16
/**
* Compute a base 2 logarithm.
*/
float __ovld __cnfn log2(float);
float2 __ovld __cnfn log2(float2);
float3 __ovld __cnfn log2(float3);
float4 __ovld __cnfn log2(float4);
float8 __ovld __cnfn log2(float8);
float16 __ovld __cnfn log2(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn log2(double);
double2 __ovld __cnfn log2(double2);
double3 __ovld __cnfn log2(double3);
double4 __ovld __cnfn log2(double4);
double8 __ovld __cnfn log2(double8);
double16 __ovld __cnfn log2(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn log2(half);
half2 __ovld __cnfn log2(half2);
half3 __ovld __cnfn log2(half3);
half4 __ovld __cnfn log2(half4);
half8 __ovld __cnfn log2(half8);
half16 __ovld __cnfn log2(half16);
#endif //cl_khr_fp16
/**
* Compute a base 10 logarithm.
*/
float __ovld __cnfn log10(float);
float2 __ovld __cnfn log10(float2);
float3 __ovld __cnfn log10(float3);
float4 __ovld __cnfn log10(float4);
float8 __ovld __cnfn log10(float8);
float16 __ovld __cnfn log10(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn log10(double);
double2 __ovld __cnfn log10(double2);
double3 __ovld __cnfn log10(double3);
double4 __ovld __cnfn log10(double4);
double8 __ovld __cnfn log10(double8);
double16 __ovld __cnfn log10(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn log10(half);
half2 __ovld __cnfn log10(half2);
half3 __ovld __cnfn log10(half3);
half4 __ovld __cnfn log10(half4);
half8 __ovld __cnfn log10(half8);
half16 __ovld __cnfn log10(half16);
#endif //cl_khr_fp16
/**
* Compute a base e logarithm of (1.0 + x).
*/
float __ovld __cnfn log1p(float x);
float2 __ovld __cnfn log1p(float2 x);
float3 __ovld __cnfn log1p(float3 x);
float4 __ovld __cnfn log1p(float4 x);
float8 __ovld __cnfn log1p(float8 x);
float16 __ovld __cnfn log1p(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn log1p(double x);
double2 __ovld __cnfn log1p(double2 x);
double3 __ovld __cnfn log1p(double3 x);
double4 __ovld __cnfn log1p(double4 x);
double8 __ovld __cnfn log1p(double8 x);
double16 __ovld __cnfn log1p(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn log1p(half x);
half2 __ovld __cnfn log1p(half2 x);
half3 __ovld __cnfn log1p(half3 x);
half4 __ovld __cnfn log1p(half4 x);
half8 __ovld __cnfn log1p(half8 x);
half16 __ovld __cnfn log1p(half16 x);
#endif //cl_khr_fp16
/**
* Compute the exponent of x, which is the integral
* part of log_r | x |, where r is the radix of the
* floating-point format (2 for these types).
*/
float __ovld __cnfn logb(float x);
float2 __ovld __cnfn logb(float2 x);
float3 __ovld __cnfn logb(float3 x);
float4 __ovld __cnfn logb(float4 x);
float8 __ovld __cnfn logb(float8 x);
float16 __ovld __cnfn logb(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn logb(double x);
double2 __ovld __cnfn logb(double2 x);
double3 __ovld __cnfn logb(double3 x);
double4 __ovld __cnfn logb(double4 x);
double8 __ovld __cnfn logb(double8 x);
double16 __ovld __cnfn logb(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn logb(half x);
half2 __ovld __cnfn logb(half2 x);
half3 __ovld __cnfn logb(half3 x);
half4 __ovld __cnfn logb(half4 x);
half8 __ovld __cnfn logb(half8 x);
half16 __ovld __cnfn logb(half16 x);
#endif //cl_khr_fp16
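/*
* Worked example (editor's addition): for these binary formats the radix r
* is 2, so logb() agrees with ilogb() up to the result type.
*
*   ilogb(8.0f)  == 3      // 8 == 1.0 * 2^3
*   logb(0.125f) == -3.0f  // 0.125 == 1.0 * 2^-3
*/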
/**
* mad approximates a * b + c. Whether or how the
* product of a * b is rounded and how supernormal or
* subnormal intermediate products are handled is not
* defined. mad is intended to be used where speed is
* preferred over accuracy.
*/
float __ovld __cnfn mad(float a, float b, float c);
float2 __ovld __cnfn mad(float2 a, float2 b, float2 c);
float3 __ovld __cnfn mad(float3 a, float3 b, float3 c);
float4 __ovld __cnfn mad(float4 a, float4 b, float4 c);
float8 __ovld __cnfn mad(float8 a, float8 b, float8 c);
float16 __ovld __cnfn mad(float16 a, float16 b, float16 c);
#ifdef cl_khr_fp64
double __ovld __cnfn mad(double a, double b, double c);
double2 __ovld __cnfn mad(double2 a, double2 b, double2 c);
double3 __ovld __cnfn mad(double3 a, double3 b, double3 c);
double4 __ovld __cnfn mad(double4 a, double4 b, double4 c);
double8 __ovld __cnfn mad(double8 a, double8 b, double8 c);
double16 __ovld __cnfn mad(double16 a, double16 b, double16 c);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn mad(half a, half b, half c);
half2 __ovld __cnfn mad(half2 a, half2 b, half2 c);
half3 __ovld __cnfn mad(half3 a, half3 b, half3 c);
half4 __ovld __cnfn mad(half4 a, half4 b, half4 c);
half8 __ovld __cnfn mad(half8 a, half8 b, half8 c);
half16 __ovld __cnfn mad(half16 a, half16 b, half16 c);
#endif //cl_khr_fp16
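/*
* Illustrative sketch (editor's addition): mad() trades accuracy for speed,
* e.g. in a Horner-form polynomial. The coefficient names are hypothetical.
*
*   // p(x) = c2*x^2 + c1*x + c0, evaluated with two mad operations
*   float p = mad(mad(c2, x, c1), x, c0);
*/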
/**
* Returns x if | x | > | y |, y if | y | > | x |, otherwise
* fmax(x, y).
*/
float __ovld __cnfn maxmag(float x, float y);
float2 __ovld __cnfn maxmag(float2 x, float2 y);
float3 __ovld __cnfn maxmag(float3 x, float3 y);
float4 __ovld __cnfn maxmag(float4 x, float4 y);
float8 __ovld __cnfn maxmag(float8 x, float8 y);
float16 __ovld __cnfn maxmag(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn maxmag(double x, double y);
double2 __ovld __cnfn maxmag(double2 x, double2 y);
double3 __ovld __cnfn maxmag(double3 x, double3 y);
double4 __ovld __cnfn maxmag(double4 x, double4 y);
double8 __ovld __cnfn maxmag(double8 x, double8 y);
double16 __ovld __cnfn maxmag(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn maxmag(half x, half y);
half2 __ovld __cnfn maxmag(half2 x, half2 y);
half3 __ovld __cnfn maxmag(half3 x, half3 y);
half4 __ovld __cnfn maxmag(half4 x, half4 y);
half8 __ovld __cnfn maxmag(half8 x, half8 y);
half16 __ovld __cnfn maxmag(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Returns x if | x | < | y |, y if | y | < | x |, otherwise
* fmin(x, y).
*/
float __ovld __cnfn minmag(float x, float y);
float2 __ovld __cnfn minmag(float2 x, float2 y);
float3 __ovld __cnfn minmag(float3 x, float3 y);
float4 __ovld __cnfn minmag(float4 x, float4 y);
float8 __ovld __cnfn minmag(float8 x, float8 y);
float16 __ovld __cnfn minmag(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn minmag(double x, double y);
double2 __ovld __cnfn minmag(double2 x, double2 y);
double3 __ovld __cnfn minmag(double3 x, double3 y);
double4 __ovld __cnfn minmag(double4 x, double4 y);
double8 __ovld __cnfn minmag(double8 x, double8 y);
double16 __ovld __cnfn minmag(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn minmag(half x, half y);
half2 __ovld __cnfn minmag(half2 x, half2 y);
half3 __ovld __cnfn minmag(half3 x, half3 y);
half4 __ovld __cnfn minmag(half4 x, half4 y);
half8 __ovld __cnfn minmag(half8 x, half8 y);
half16 __ovld __cnfn minmag(half16 x, half16 y);
#endif //cl_khr_fp16
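/*
* Worked examples (editor's addition) for the magnitude selectors above:
*
*   maxmag(-3.0f, 2.0f) == -3.0f  // |-3| > |2|
*   maxmag(-2.0f, 2.0f) ==  2.0f  // equal magnitudes, falls back to fmax
*   minmag(-3.0f, 2.0f) ==  2.0f  // |2| < |-3|
*/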
/**
* Decompose a floating-point number. The modf
* function breaks the argument x into integral and
* fractional parts, each of which has the same sign as
* the argument. It stores the integral part in the object
* pointed to by iptr.
*/
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float __ovld modf(float x, float *iptr);
float2 __ovld modf(float2 x, float2 *iptr);
float3 __ovld modf(float3 x, float3 *iptr);
float4 __ovld modf(float4 x, float4 *iptr);
float8 __ovld modf(float8 x, float8 *iptr);
float16 __ovld modf(float16 x, float16 *iptr);
#ifdef cl_khr_fp64
double __ovld modf(double x, double *iptr);
double2 __ovld modf(double2 x, double2 *iptr);
double3 __ovld modf(double3 x, double3 *iptr);
double4 __ovld modf(double4 x, double4 *iptr);
double8 __ovld modf(double8 x, double8 *iptr);
double16 __ovld modf(double16 x, double16 *iptr);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld modf(half x, half *iptr);
half2 __ovld modf(half2 x, half2 *iptr);
half3 __ovld modf(half3 x, half3 *iptr);
half4 __ovld modf(half4 x, half4 *iptr);
half8 __ovld modf(half8 x, half8 *iptr);
half16 __ovld modf(half16 x, half16 *iptr);
#endif //cl_khr_fp16
#else
float __ovld modf(float x, __global float *iptr);
float2 __ovld modf(float2 x, __global float2 *iptr);
float3 __ovld modf(float3 x, __global float3 *iptr);
float4 __ovld modf(float4 x, __global float4 *iptr);
float8 __ovld modf(float8 x, __global float8 *iptr);
float16 __ovld modf(float16 x, __global float16 *iptr);
float __ovld modf(float x, __local float *iptr);
float2 __ovld modf(float2 x, __local float2 *iptr);
float3 __ovld modf(float3 x, __local float3 *iptr);
float4 __ovld modf(float4 x, __local float4 *iptr);
float8 __ovld modf(float8 x, __local float8 *iptr);
float16 __ovld modf(float16 x, __local float16 *iptr);
float __ovld modf(float x, __private float *iptr);
float2 __ovld modf(float2 x, __private float2 *iptr);
float3 __ovld modf(float3 x, __private float3 *iptr);
float4 __ovld modf(float4 x, __private float4 *iptr);
float8 __ovld modf(float8 x, __private float8 *iptr);
float16 __ovld modf(float16 x, __private float16 *iptr);
#ifdef cl_khr_fp64
double __ovld modf(double x, __global double *iptr);
double2 __ovld modf(double2 x, __global double2 *iptr);
double3 __ovld modf(double3 x, __global double3 *iptr);
double4 __ovld modf(double4 x, __global double4 *iptr);
double8 __ovld modf(double8 x, __global double8 *iptr);
double16 __ovld modf(double16 x, __global double16 *iptr);
double __ovld modf(double x, __local double *iptr);
double2 __ovld modf(double2 x, __local double2 *iptr);
double3 __ovld modf(double3 x, __local double3 *iptr);
double4 __ovld modf(double4 x, __local double4 *iptr);
double8 __ovld modf(double8 x, __local double8 *iptr);
double16 __ovld modf(double16 x, __local double16 *iptr);
double __ovld modf(double x, __private double *iptr);
double2 __ovld modf(double2 x, __private double2 *iptr);
double3 __ovld modf(double3 x, __private double3 *iptr);
double4 __ovld modf(double4 x, __private double4 *iptr);
double8 __ovld modf(double8 x, __private double8 *iptr);
double16 __ovld modf(double16 x, __private double16 *iptr);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld modf(half x, __global half *iptr);
half2 __ovld modf(half2 x, __global half2 *iptr);
half3 __ovld modf(half3 x, __global half3 *iptr);
half4 __ovld modf(half4 x, __global half4 *iptr);
half8 __ovld modf(half8 x, __global half8 *iptr);
half16 __ovld modf(half16 x, __global half16 *iptr);
half __ovld modf(half x, __local half *iptr);
half2 __ovld modf(half2 x, __local half2 *iptr);
half3 __ovld modf(half3 x, __local half3 *iptr);
half4 __ovld modf(half4 x, __local half4 *iptr);
half8 __ovld modf(half8 x, __local half8 *iptr);
half16 __ovld modf(half16 x, __local half16 *iptr);
half __ovld modf(half x, __private half *iptr);
half2 __ovld modf(half2 x, __private half2 *iptr);
half3 __ovld modf(half3 x, __private half3 *iptr);
half4 __ovld modf(half4 x, __private half4 *iptr);
half8 __ovld modf(half8 x, __private half8 *iptr);
half16 __ovld modf(half16 x, __private half16 *iptr);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
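/*
* Worked example (editor's addition): both parts produced by modf() carry
* the sign of the argument.
*
*   float ip;
*   float fp = modf(-2.75f, &ip);  // fp == -0.75f, ip == -2.0f
*/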
/**
* Returns a quiet NaN. The nancode may be placed
* in the significand of the resulting NaN.
*/
float __ovld __cnfn nan(uint nancode);
float2 __ovld __cnfn nan(uint2 nancode);
float3 __ovld __cnfn nan(uint3 nancode);
float4 __ovld __cnfn nan(uint4 nancode);
float8 __ovld __cnfn nan(uint8 nancode);
float16 __ovld __cnfn nan(uint16 nancode);
#ifdef cl_khr_fp64
double __ovld __cnfn nan(ulong nancode);
double2 __ovld __cnfn nan(ulong2 nancode);
double3 __ovld __cnfn nan(ulong3 nancode);
double4 __ovld __cnfn nan(ulong4 nancode);
double8 __ovld __cnfn nan(ulong8 nancode);
double16 __ovld __cnfn nan(ulong16 nancode);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn nan(ushort nancode);
half2 __ovld __cnfn nan(ushort2 nancode);
half3 __ovld __cnfn nan(ushort3 nancode);
half4 __ovld __cnfn nan(ushort4 nancode);
half8 __ovld __cnfn nan(ushort8 nancode);
half16 __ovld __cnfn nan(ushort16 nancode);
#endif //cl_khr_fp16
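/*
* Illustrative sketch (editor's addition): nan() builds a quiet NaN whose
* payload may carry a diagnostic code; the value 7u here is arbitrary.
*
*   float bad = nan(7u);     // quiet NaN; the nancode may appear in the significand
*   int oops = isnan(bad);   // isnan() is the portable way to detect it
*/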
/**
* Computes the next representable single-precision
* floating-point value following x in the direction of
* y. Thus, if y is less than x, nextafter() returns the
* largest representable floating-point number less
* than x.
*/
float __ovld __cnfn nextafter(float x, float y);
float2 __ovld __cnfn nextafter(float2 x, float2 y);
float3 __ovld __cnfn nextafter(float3 x, float3 y);
float4 __ovld __cnfn nextafter(float4 x, float4 y);
float8 __ovld __cnfn nextafter(float8 x, float8 y);
float16 __ovld __cnfn nextafter(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn nextafter(double x, double y);
double2 __ovld __cnfn nextafter(double2 x, double2 y);
double3 __ovld __cnfn nextafter(double3 x, double3 y);
double4 __ovld __cnfn nextafter(double4 x, double4 y);
double8 __ovld __cnfn nextafter(double8 x, double8 y);
double16 __ovld __cnfn nextafter(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn nextafter(half x, half y);
half2 __ovld __cnfn nextafter(half2 x, half2 y);
half3 __ovld __cnfn nextafter(half3 x, half3 y);
half4 __ovld __cnfn nextafter(half4 x, half4 y);
half8 __ovld __cnfn nextafter(half8 x, half8 y);
half16 __ovld __cnfn nextafter(half16 x, half16 y);
#endif //cl_khr_fp16
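/*
* Worked example (editor's addition): nextafter() moves exactly one
* representable value toward y, which is handy for bracketing tests.
*
*   nextafter(1.0f, 2.0f) == 1.0f + FLT_EPSILON  // smallest float above 1.0
*   nextafter(1.0f, 0.0f) <  1.0f                // one step downward
*/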
/**
* Compute x to the power y.
*/
float __ovld __cnfn pow(float x, float y);
float2 __ovld __cnfn pow(float2 x, float2 y);
float3 __ovld __cnfn pow(float3 x, float3 y);
float4 __ovld __cnfn pow(float4 x, float4 y);
float8 __ovld __cnfn pow(float8 x, float8 y);
float16 __ovld __cnfn pow(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn pow(double x, double y);
double2 __ovld __cnfn pow(double2 x, double2 y);
double3 __ovld __cnfn pow(double3 x, double3 y);
double4 __ovld __cnfn pow(double4 x, double4 y);
double8 __ovld __cnfn pow(double8 x, double8 y);
double16 __ovld __cnfn pow(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn pow(half x, half y);
half2 __ovld __cnfn pow(half2 x, half2 y);
half3 __ovld __cnfn pow(half3 x, half3 y);
half4 __ovld __cnfn pow(half4 x, half4 y);
half8 __ovld __cnfn pow(half8 x, half8 y);
half16 __ovld __cnfn pow(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Compute x to the power y, where y is an integer.
*/
float __ovld __cnfn pown(float x, int y);
float2 __ovld __cnfn pown(float2 x, int2 y);
float3 __ovld __cnfn pown(float3 x, int3 y);
float4 __ovld __cnfn pown(float4 x, int4 y);
float8 __ovld __cnfn pown(float8 x, int8 y);
float16 __ovld __cnfn pown(float16 x, int16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn pown(double x, int y);
double2 __ovld __cnfn pown(double2 x, int2 y);
double3 __ovld __cnfn pown(double3 x, int3 y);
double4 __ovld __cnfn pown(double4 x, int4 y);
double8 __ovld __cnfn pown(double8 x, int8 y);
double16 __ovld __cnfn pown(double16 x, int16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn pown(half x, int y);
half2 __ovld __cnfn pown(half2 x, int2 y);
half3 __ovld __cnfn pown(half3 x, int3 y);
half4 __ovld __cnfn pown(half4 x, int4 y);
half8 __ovld __cnfn pown(half8 x, int8 y);
half16 __ovld __cnfn pown(half16 x, int16 y);
#endif //cl_khr_fp16
/**
* Compute x to the power y, where x is >= 0.
*/
float __ovld __cnfn powr(float x, float y);
float2 __ovld __cnfn powr(float2 x, float2 y);
float3 __ovld __cnfn powr(float3 x, float3 y);
float4 __ovld __cnfn powr(float4 x, float4 y);
float8 __ovld __cnfn powr(float8 x, float8 y);
float16 __ovld __cnfn powr(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn powr(double x, double y);
double2 __ovld __cnfn powr(double2 x, double2 y);
double3 __ovld __cnfn powr(double3 x, double3 y);
double4 __ovld __cnfn powr(double4 x, double4 y);
double8 __ovld __cnfn powr(double8 x, double8 y);
double16 __ovld __cnfn powr(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn powr(half x, half y);
half2 __ovld __cnfn powr(half2 x, half2 y);
half3 __ovld __cnfn powr(half3 x, half3 y);
half4 __ovld __cnfn powr(half4 x, half4 y);
half8 __ovld __cnfn powr(half8 x, half8 y);
half16 __ovld __cnfn powr(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Compute the value r such that r = x - n*y, where n
* is the integer nearest the exact value of x/y. If there
* are two integers closest to x/y, n shall be the even
* one. If r is zero, it is given the same sign as x.
*/
float __ovld __cnfn remainder(float x, float y);
float2 __ovld __cnfn remainder(float2 x, float2 y);
float3 __ovld __cnfn remainder(float3 x, float3 y);
float4 __ovld __cnfn remainder(float4 x, float4 y);
float8 __ovld __cnfn remainder(float8 x, float8 y);
float16 __ovld __cnfn remainder(float16 x, float16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn remainder(double x, double y);
double2 __ovld __cnfn remainder(double2 x, double2 y);
double3 __ovld __cnfn remainder(double3 x, double3 y);
double4 __ovld __cnfn remainder(double4 x, double4 y);
double8 __ovld __cnfn remainder(double8 x, double8 y);
double16 __ovld __cnfn remainder(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn remainder(half x, half y);
half2 __ovld __cnfn remainder(half2 x, half2 y);
half3 __ovld __cnfn remainder(half3 x, half3 y);
half4 __ovld __cnfn remainder(half4 x, half4 y);
half8 __ovld __cnfn remainder(half8 x, half8 y);
half16 __ovld __cnfn remainder(half16 x, half16 y);
#endif //cl_khr_fp16
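/*
* Worked examples (editor's addition) for the round-to-nearest-even
* remainder described above:
*
*   remainder(5.0f, 3.0f) == -1.0f  // n == 2 is nearest to 5/3, 5 - 2*3 == -1
*   remainder(7.0f, 2.0f) == -1.0f  // 7/2 == 3.5, even n == 4, 7 - 4*2 == -1
*/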
/**
* The remquo function computes the value r such
* that r = x - n*y, where n is the integer nearest the
* exact value of x/y. If there are two integers closest
* to x/y, n shall be the even one. If r is zero, it is
* given the same sign as x. This is the same value
* that is returned by the remainder function.
* remquo also calculates the lower seven bits of the
* integral quotient x/y, and gives that value the same
* sign as x/y. It stores this signed value in the object
* pointed to by quo.
*/
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float __ovld remquo(float x, float y, int *quo);
float2 __ovld remquo(float2 x, float2 y, int2 *quo);
float3 __ovld remquo(float3 x, float3 y, int3 *quo);
float4 __ovld remquo(float4 x, float4 y, int4 *quo);
float8 __ovld remquo(float8 x, float8 y, int8 *quo);
float16 __ovld remquo(float16 x, float16 y, int16 *quo);
#ifdef cl_khr_fp64
double __ovld remquo(double x, double y, int *quo);
double2 __ovld remquo(double2 x, double2 y, int2 *quo);
double3 __ovld remquo(double3 x, double3 y, int3 *quo);
double4 __ovld remquo(double4 x, double4 y, int4 *quo);
double8 __ovld remquo(double8 x, double8 y, int8 *quo);
double16 __ovld remquo(double16 x, double16 y, int16 *quo);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld remquo(half x, half y, int *quo);
half2 __ovld remquo(half2 x, half2 y, int2 *quo);
half3 __ovld remquo(half3 x, half3 y, int3 *quo);
half4 __ovld remquo(half4 x, half4 y, int4 *quo);
half8 __ovld remquo(half8 x, half8 y, int8 *quo);
half16 __ovld remquo(half16 x, half16 y, int16 *quo);
#endif //cl_khr_fp16
#else
float __ovld remquo(float x, float y, __global int *quo);
float2 __ovld remquo(float2 x, float2 y, __global int2 *quo);
float3 __ovld remquo(float3 x, float3 y, __global int3 *quo);
float4 __ovld remquo(float4 x, float4 y, __global int4 *quo);
float8 __ovld remquo(float8 x, float8 y, __global int8 *quo);
float16 __ovld remquo(float16 x, float16 y, __global int16 *quo);
float __ovld remquo(float x, float y, __local int *quo);
float2 __ovld remquo(float2 x, float2 y, __local int2 *quo);
float3 __ovld remquo(float3 x, float3 y, __local int3 *quo);
float4 __ovld remquo(float4 x, float4 y, __local int4 *quo);
float8 __ovld remquo(float8 x, float8 y, __local int8 *quo);
float16 __ovld remquo(float16 x, float16 y, __local int16 *quo);
float __ovld remquo(float x, float y, __private int *quo);
float2 __ovld remquo(float2 x, float2 y, __private int2 *quo);
float3 __ovld remquo(float3 x, float3 y, __private int3 *quo);
float4 __ovld remquo(float4 x, float4 y, __private int4 *quo);
float8 __ovld remquo(float8 x, float8 y, __private int8 *quo);
float16 __ovld remquo(float16 x, float16 y, __private int16 *quo);
#ifdef cl_khr_fp64
double __ovld remquo(double x, double y, __global int *quo);
double2 __ovld remquo(double2 x, double2 y, __global int2 *quo);
double3 __ovld remquo(double3 x, double3 y, __global int3 *quo);
double4 __ovld remquo(double4 x, double4 y, __global int4 *quo);
double8 __ovld remquo(double8 x, double8 y, __global int8 *quo);
double16 __ovld remquo(double16 x, double16 y, __global int16 *quo);
double __ovld remquo(double x, double y, __local int *quo);
double2 __ovld remquo(double2 x, double2 y, __local int2 *quo);
double3 __ovld remquo(double3 x, double3 y, __local int3 *quo);
double4 __ovld remquo(double4 x, double4 y, __local int4 *quo);
double8 __ovld remquo(double8 x, double8 y, __local int8 *quo);
double16 __ovld remquo(double16 x, double16 y, __local int16 *quo);
double __ovld remquo(double x, double y, __private int *quo);
double2 __ovld remquo(double2 x, double2 y, __private int2 *quo);
double3 __ovld remquo(double3 x, double3 y, __private int3 *quo);
double4 __ovld remquo(double4 x, double4 y, __private int4 *quo);
double8 __ovld remquo(double8 x, double8 y, __private int8 *quo);
double16 __ovld remquo(double16 x, double16 y, __private int16 *quo);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld remquo(half x, half y, __global int *quo);
half2 __ovld remquo(half2 x, half2 y, __global int2 *quo);
half3 __ovld remquo(half3 x, half3 y, __global int3 *quo);
half4 __ovld remquo(half4 x, half4 y, __global int4 *quo);
half8 __ovld remquo(half8 x, half8 y, __global int8 *quo);
half16 __ovld remquo(half16 x, half16 y, __global int16 *quo);
half __ovld remquo(half x, half y, __local int *quo);
half2 __ovld remquo(half2 x, half2 y, __local int2 *quo);
half3 __ovld remquo(half3 x, half3 y, __local int3 *quo);
half4 __ovld remquo(half4 x, half4 y, __local int4 *quo);
half8 __ovld remquo(half8 x, half8 y, __local int8 *quo);
half16 __ovld remquo(half16 x, half16 y, __local int16 *quo);
half __ovld remquo(half x, half y, __private int *quo);
half2 __ovld remquo(half2 x, half2 y, __private int2 *quo);
half3 __ovld remquo(half3 x, half3 y, __private int3 *quo);
half4 __ovld remquo(half4 x, half4 y, __private int4 *quo);
half8 __ovld remquo(half8 x, half8 y, __private int8 *quo);
half16 __ovld remquo(half16 x, half16 y, __private int16 *quo);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
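/*
* Illustrative sketch (editor's addition): the quotient bits from remquo()
* are commonly used for quadrant selection in trigonometric argument
* reduction. Variable names are hypothetical, and accuracy for very large x
* depends on how pi/2 is represented.
*
*   int q;
*   float r = remquo(x, M_PI_2_F, &q);  // r in [-pi/4, pi/4], low bits of n in q
*   int quadrant = q & 3;               // for x >= 0, selects the sin/cos identity
*/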
/**
* Round to integral value (using round to nearest
* even rounding mode) in floating-point format.
* Refer to section 7.1 for description of rounding
* modes.
*/
float __ovld __cnfn rint(float);
float2 __ovld __cnfn rint(float2);
float3 __ovld __cnfn rint(float3);
float4 __ovld __cnfn rint(float4);
float8 __ovld __cnfn rint(float8);
float16 __ovld __cnfn rint(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn rint(double);
double2 __ovld __cnfn rint(double2);
double3 __ovld __cnfn rint(double3);
double4 __ovld __cnfn rint(double4);
double8 __ovld __cnfn rint(double8);
double16 __ovld __cnfn rint(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn rint(half);
half2 __ovld __cnfn rint(half2);
half3 __ovld __cnfn rint(half3);
half4 __ovld __cnfn rint(half4);
half8 __ovld __cnfn rint(half8);
half16 __ovld __cnfn rint(half16);
#endif //cl_khr_fp16
/**
* Compute x to the power 1/y.
*/
float __ovld __cnfn rootn(float x, int y);
float2 __ovld __cnfn rootn(float2 x, int2 y);
float3 __ovld __cnfn rootn(float3 x, int3 y);
float4 __ovld __cnfn rootn(float4 x, int4 y);
float8 __ovld __cnfn rootn(float8 x, int8 y);
float16 __ovld __cnfn rootn(float16 x, int16 y);
#ifdef cl_khr_fp64
double __ovld __cnfn rootn(double x, int y);
double2 __ovld __cnfn rootn(double2 x, int2 y);
double3 __ovld __cnfn rootn(double3 x, int3 y);
double4 __ovld __cnfn rootn(double4 x, int4 y);
double8 __ovld __cnfn rootn(double8 x, int8 y);
double16 __ovld __cnfn rootn(double16 x, int16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn rootn(half x, int y);
half2 __ovld __cnfn rootn(half2 x, int2 y);
half3 __ovld __cnfn rootn(half3 x, int3 y);
half4 __ovld __cnfn rootn(half4 x, int4 y);
half8 __ovld __cnfn rootn(half8 x, int8 y);
half16 __ovld __cnfn rootn(half16 x, int16 y);
#endif //cl_khr_fp16
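/*
* Worked examples (editor's addition) for rootn():
*
*   rootn(27.0f, 3)  == 3.0f   // cube root
*   rootn(16.0f, -2) == 0.25f  // 16^(-1/2)
*/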
/**
* Return the integral value nearest to x rounding
* halfway cases away from zero, regardless of the
* current rounding direction.
*/
float __ovld __cnfn round(float x);
float2 __ovld __cnfn round(float2 x);
float3 __ovld __cnfn round(float3 x);
float4 __ovld __cnfn round(float4 x);
float8 __ovld __cnfn round(float8 x);
float16 __ovld __cnfn round(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn round(double x);
double2 __ovld __cnfn round(double2 x);
double3 __ovld __cnfn round(double3 x);
double4 __ovld __cnfn round(double4 x);
double8 __ovld __cnfn round(double8 x);
double16 __ovld __cnfn round(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn round(half x);
half2 __ovld __cnfn round(half2 x);
half3 __ovld __cnfn round(half3 x);
half4 __ovld __cnfn round(half4 x);
half8 __ovld __cnfn round(half8 x);
half16 __ovld __cnfn round(half16 x);
#endif //cl_khr_fp16
/**
* Compute inverse square root.
*/
float __ovld __cnfn rsqrt(float);
float2 __ovld __cnfn rsqrt(float2);
float3 __ovld __cnfn rsqrt(float3);
float4 __ovld __cnfn rsqrt(float4);
float8 __ovld __cnfn rsqrt(float8);
float16 __ovld __cnfn rsqrt(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn rsqrt(double);
double2 __ovld __cnfn rsqrt(double2);
double3 __ovld __cnfn rsqrt(double3);
double4 __ovld __cnfn rsqrt(double4);
double8 __ovld __cnfn rsqrt(double8);
double16 __ovld __cnfn rsqrt(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn rsqrt(half);
half2 __ovld __cnfn rsqrt(half2);
half3 __ovld __cnfn rsqrt(half3);
half4 __ovld __cnfn rsqrt(half4);
half8 __ovld __cnfn rsqrt(half8);
half16 __ovld __cnfn rsqrt(half16);
#endif //cl_khr_fp16
/**
* Compute sine.
*/
float __ovld __cnfn sin(float);
float2 __ovld __cnfn sin(float2);
float3 __ovld __cnfn sin(float3);
float4 __ovld __cnfn sin(float4);
float8 __ovld __cnfn sin(float8);
float16 __ovld __cnfn sin(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn sin(double);
double2 __ovld __cnfn sin(double2);
double3 __ovld __cnfn sin(double3);
double4 __ovld __cnfn sin(double4);
double8 __ovld __cnfn sin(double8);
double16 __ovld __cnfn sin(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn sin(half);
half2 __ovld __cnfn sin(half2);
half3 __ovld __cnfn sin(half3);
half4 __ovld __cnfn sin(half4);
half8 __ovld __cnfn sin(half8);
half16 __ovld __cnfn sin(half16);
#endif //cl_khr_fp16
/**
* Compute sine and cosine of x. The computed sine
* is the return value and computed cosine is returned
* in cosval.
*/
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float __ovld sincos(float x, float *cosval);
float2 __ovld sincos(float2 x, float2 *cosval);
float3 __ovld sincos(float3 x, float3 *cosval);
float4 __ovld sincos(float4 x, float4 *cosval);
float8 __ovld sincos(float8 x, float8 *cosval);
float16 __ovld sincos(float16 x, float16 *cosval);
#ifdef cl_khr_fp64
double __ovld sincos(double x, double *cosval);
double2 __ovld sincos(double2 x, double2 *cosval);
double3 __ovld sincos(double3 x, double3 *cosval);
double4 __ovld sincos(double4 x, double4 *cosval);
double8 __ovld sincos(double8 x, double8 *cosval);
double16 __ovld sincos(double16 x, double16 *cosval);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld sincos(half x, half *cosval);
half2 __ovld sincos(half2 x, half2 *cosval);
half3 __ovld sincos(half3 x, half3 *cosval);
half4 __ovld sincos(half4 x, half4 *cosval);
half8 __ovld sincos(half8 x, half8 *cosval);
half16 __ovld sincos(half16 x, half16 *cosval);
#endif //cl_khr_fp16
#else
float __ovld sincos(float x, __global float *cosval);
float2 __ovld sincos(float2 x, __global float2 *cosval);
float3 __ovld sincos(float3 x, __global float3 *cosval);
float4 __ovld sincos(float4 x, __global float4 *cosval);
float8 __ovld sincos(float8 x, __global float8 *cosval);
float16 __ovld sincos(float16 x, __global float16 *cosval);
float __ovld sincos(float x, __local float *cosval);
float2 __ovld sincos(float2 x, __local float2 *cosval);
float3 __ovld sincos(float3 x, __local float3 *cosval);
float4 __ovld sincos(float4 x, __local float4 *cosval);
float8 __ovld sincos(float8 x, __local float8 *cosval);
float16 __ovld sincos(float16 x, __local float16 *cosval);
float __ovld sincos(float x, __private float *cosval);
float2 __ovld sincos(float2 x, __private float2 *cosval);
float3 __ovld sincos(float3 x, __private float3 *cosval);
float4 __ovld sincos(float4 x, __private float4 *cosval);
float8 __ovld sincos(float8 x, __private float8 *cosval);
float16 __ovld sincos(float16 x, __private float16 *cosval);
#ifdef cl_khr_fp64
double __ovld sincos(double x, __global double *cosval);
double2 __ovld sincos(double2 x, __global double2 *cosval);
double3 __ovld sincos(double3 x, __global double3 *cosval);
double4 __ovld sincos(double4 x, __global double4 *cosval);
double8 __ovld sincos(double8 x, __global double8 *cosval);
double16 __ovld sincos(double16 x, __global double16 *cosval);
double __ovld sincos(double x, __local double *cosval);
double2 __ovld sincos(double2 x, __local double2 *cosval);
double3 __ovld sincos(double3 x, __local double3 *cosval);
double4 __ovld sincos(double4 x, __local double4 *cosval);
double8 __ovld sincos(double8 x, __local double8 *cosval);
double16 __ovld sincos(double16 x, __local double16 *cosval);
double __ovld sincos(double x, __private double *cosval);
double2 __ovld sincos(double2 x, __private double2 *cosval);
double3 __ovld sincos(double3 x, __private double3 *cosval);
double4 __ovld sincos(double4 x, __private double4 *cosval);
double8 __ovld sincos(double8 x, __private double8 *cosval);
double16 __ovld sincos(double16 x, __private double16 *cosval);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld sincos(half x, __global half *cosval);
half2 __ovld sincos(half2 x, __global half2 *cosval);
half3 __ovld sincos(half3 x, __global half3 *cosval);
half4 __ovld sincos(half4 x, __global half4 *cosval);
half8 __ovld sincos(half8 x, __global half8 *cosval);
half16 __ovld sincos(half16 x, __global half16 *cosval);
half __ovld sincos(half x, __local half *cosval);
half2 __ovld sincos(half2 x, __local half2 *cosval);
half3 __ovld sincos(half3 x, __local half3 *cosval);
half4 __ovld sincos(half4 x, __local half4 *cosval);
half8 __ovld sincos(half8 x, __local half8 *cosval);
half16 __ovld sincos(half16 x, __local half16 *cosval);
half __ovld sincos(half x, __private half *cosval);
half2 __ovld sincos(half2 x, __private half2 *cosval);
half3 __ovld sincos(half3 x, __private half3 *cosval);
half4 __ovld sincos(half4 x, __private half4 *cosval);
half8 __ovld sincos(half8 x, __private half8 *cosval);
half16 __ovld sincos(half16 x, __private half16 *cosval);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
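/*
* Illustrative sketch (editor's addition): sincos() returns the sine and
* writes the cosine through the pointer, so a 2-D rotation needs only one
* call. The helper name rotate2d is hypothetical.
*
*   float2 rotate2d(float2 p, float theta) {
*     float c;
*     float s = sincos(theta, &c);
*     return (float2)(p.x * c - p.y * s,
*                     p.x * s + p.y * c);
*   }
*/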
/**
* Compute hyperbolic sine.
*/
float __ovld __cnfn sinh(float);
float2 __ovld __cnfn sinh(float2);
float3 __ovld __cnfn sinh(float3);
float4 __ovld __cnfn sinh(float4);
float8 __ovld __cnfn sinh(float8);
float16 __ovld __cnfn sinh(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn sinh(double);
double2 __ovld __cnfn sinh(double2);
double3 __ovld __cnfn sinh(double3);
double4 __ovld __cnfn sinh(double4);
double8 __ovld __cnfn sinh(double8);
double16 __ovld __cnfn sinh(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn sinh(half);
half2 __ovld __cnfn sinh(half2);
half3 __ovld __cnfn sinh(half3);
half4 __ovld __cnfn sinh(half4);
half8 __ovld __cnfn sinh(half8);
half16 __ovld __cnfn sinh(half16);
#endif //cl_khr_fp16
/**
* Compute sin (PI * x).
*/
float __ovld __cnfn sinpi(float x);
float2 __ovld __cnfn sinpi(float2 x);
float3 __ovld __cnfn sinpi(float3 x);
float4 __ovld __cnfn sinpi(float4 x);
float8 __ovld __cnfn sinpi(float8 x);
float16 __ovld __cnfn sinpi(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn sinpi(double x);
double2 __ovld __cnfn sinpi(double2 x);
double3 __ovld __cnfn sinpi(double3 x);
double4 __ovld __cnfn sinpi(double4 x);
double8 __ovld __cnfn sinpi(double8 x);
double16 __ovld __cnfn sinpi(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn sinpi(half x);
half2 __ovld __cnfn sinpi(half2 x);
half3 __ovld __cnfn sinpi(half3 x);
half4 __ovld __cnfn sinpi(half4 x);
half8 __ovld __cnfn sinpi(half8 x);
half16 __ovld __cnfn sinpi(half16 x);
#endif //cl_khr_fp16
/**
* Compute square root.
*/
float __ovld __cnfn sqrt(float);
float2 __ovld __cnfn sqrt(float2);
float3 __ovld __cnfn sqrt(float3);
float4 __ovld __cnfn sqrt(float4);
float8 __ovld __cnfn sqrt(float8);
float16 __ovld __cnfn sqrt(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn sqrt(double);
double2 __ovld __cnfn sqrt(double2);
double3 __ovld __cnfn sqrt(double3);
double4 __ovld __cnfn sqrt(double4);
double8 __ovld __cnfn sqrt(double8);
double16 __ovld __cnfn sqrt(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn sqrt(half);
half2 __ovld __cnfn sqrt(half2);
half3 __ovld __cnfn sqrt(half3);
half4 __ovld __cnfn sqrt(half4);
half8 __ovld __cnfn sqrt(half8);
half16 __ovld __cnfn sqrt(half16);
#endif //cl_khr_fp16
/**
* Compute tangent.
*/
float __ovld __cnfn tan(float);
float2 __ovld __cnfn tan(float2);
float3 __ovld __cnfn tan(float3);
float4 __ovld __cnfn tan(float4);
float8 __ovld __cnfn tan(float8);
float16 __ovld __cnfn tan(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn tan(double);
double2 __ovld __cnfn tan(double2);
double3 __ovld __cnfn tan(double3);
double4 __ovld __cnfn tan(double4);
double8 __ovld __cnfn tan(double8);
double16 __ovld __cnfn tan(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn tan(half);
half2 __ovld __cnfn tan(half2);
half3 __ovld __cnfn tan(half3);
half4 __ovld __cnfn tan(half4);
half8 __ovld __cnfn tan(half8);
half16 __ovld __cnfn tan(half16);
#endif //cl_khr_fp16
/**
* Compute hyperbolic tangent.
*/
float __ovld __cnfn tanh(float);
float2 __ovld __cnfn tanh(float2);
float3 __ovld __cnfn tanh(float3);
float4 __ovld __cnfn tanh(float4);
float8 __ovld __cnfn tanh(float8);
float16 __ovld __cnfn tanh(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn tanh(double);
double2 __ovld __cnfn tanh(double2);
double3 __ovld __cnfn tanh(double3);
double4 __ovld __cnfn tanh(double4);
double8 __ovld __cnfn tanh(double8);
double16 __ovld __cnfn tanh(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn tanh(half);
half2 __ovld __cnfn tanh(half2);
half3 __ovld __cnfn tanh(half3);
half4 __ovld __cnfn tanh(half4);
half8 __ovld __cnfn tanh(half8);
half16 __ovld __cnfn tanh(half16);
#endif //cl_khr_fp16
/**
* Compute tan (PI * x).
*/
float __ovld __cnfn tanpi(float x);
float2 __ovld __cnfn tanpi(float2 x);
float3 __ovld __cnfn tanpi(float3 x);
float4 __ovld __cnfn tanpi(float4 x);
float8 __ovld __cnfn tanpi(float8 x);
float16 __ovld __cnfn tanpi(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn tanpi(double x);
double2 __ovld __cnfn tanpi(double2 x);
double3 __ovld __cnfn tanpi(double3 x);
double4 __ovld __cnfn tanpi(double4 x);
double8 __ovld __cnfn tanpi(double8 x);
double16 __ovld __cnfn tanpi(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn tanpi(half x);
half2 __ovld __cnfn tanpi(half2 x);
half3 __ovld __cnfn tanpi(half3 x);
half4 __ovld __cnfn tanpi(half4 x);
half8 __ovld __cnfn tanpi(half8 x);
half16 __ovld __cnfn tanpi(half16 x);
#endif //cl_khr_fp16
/**
* Compute the gamma function.
*/
float __ovld __cnfn tgamma(float);
float2 __ovld __cnfn tgamma(float2);
float3 __ovld __cnfn tgamma(float3);
float4 __ovld __cnfn tgamma(float4);
float8 __ovld __cnfn tgamma(float8);
float16 __ovld __cnfn tgamma(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn tgamma(double);
double2 __ovld __cnfn tgamma(double2);
double3 __ovld __cnfn tgamma(double3);
double4 __ovld __cnfn tgamma(double4);
double8 __ovld __cnfn tgamma(double8);
double16 __ovld __cnfn tgamma(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn tgamma(half);
half2 __ovld __cnfn tgamma(half2);
half3 __ovld __cnfn tgamma(half3);
half4 __ovld __cnfn tgamma(half4);
half8 __ovld __cnfn tgamma(half8);
half16 __ovld __cnfn tgamma(half16);
#endif //cl_khr_fp16
/**
* Round to integral value using the round to zero
* rounding mode.
*/
float __ovld __cnfn trunc(float);
float2 __ovld __cnfn trunc(float2);
float3 __ovld __cnfn trunc(float3);
float4 __ovld __cnfn trunc(float4);
float8 __ovld __cnfn trunc(float8);
float16 __ovld __cnfn trunc(float16);
#ifdef cl_khr_fp64
double __ovld __cnfn trunc(double);
double2 __ovld __cnfn trunc(double2);
double3 __ovld __cnfn trunc(double3);
double4 __ovld __cnfn trunc(double4);
double8 __ovld __cnfn trunc(double8);
double16 __ovld __cnfn trunc(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn trunc(half);
half2 __ovld __cnfn trunc(half2);
half3 __ovld __cnfn trunc(half3);
half4 __ovld __cnfn trunc(half4);
half8 __ovld __cnfn trunc(half8);
half16 __ovld __cnfn trunc(half16);
#endif //cl_khr_fp16
/**
* Compute cosine. x must be in the range -2^16 ... +2^16.
*/
float __ovld __cnfn half_cos(float x);
float2 __ovld __cnfn half_cos(float2 x);
float3 __ovld __cnfn half_cos(float3 x);
float4 __ovld __cnfn half_cos(float4 x);
float8 __ovld __cnfn half_cos(float8 x);
float16 __ovld __cnfn half_cos(float16 x);
/**
* Compute x / y.
*/
float __ovld __cnfn half_divide(float x, float y);
float2 __ovld __cnfn half_divide(float2 x, float2 y);
float3 __ovld __cnfn half_divide(float3 x, float3 y);
float4 __ovld __cnfn half_divide(float4 x, float4 y);
float8 __ovld __cnfn half_divide(float8 x, float8 y);
float16 __ovld __cnfn half_divide(float16 x, float16 y);
/**
* Compute the base-e exponential of x.
*/
float __ovld __cnfn half_exp(float x);
float2 __ovld __cnfn half_exp(float2 x);
float3 __ovld __cnfn half_exp(float3 x);
float4 __ovld __cnfn half_exp(float4 x);
float8 __ovld __cnfn half_exp(float8 x);
float16 __ovld __cnfn half_exp(float16 x);
/**
* Compute the base-2 exponential of x.
*/
float __ovld __cnfn half_exp2(float x);
float2 __ovld __cnfn half_exp2(float2 x);
float3 __ovld __cnfn half_exp2(float3 x);
float4 __ovld __cnfn half_exp2(float4 x);
float8 __ovld __cnfn half_exp2(float8 x);
float16 __ovld __cnfn half_exp2(float16 x);
/**
* Compute the base-10 exponential of x.
*/
float __ovld __cnfn half_exp10(float x);
float2 __ovld __cnfn half_exp10(float2 x);
float3 __ovld __cnfn half_exp10(float3 x);
float4 __ovld __cnfn half_exp10(float4 x);
float8 __ovld __cnfn half_exp10(float8 x);
float16 __ovld __cnfn half_exp10(float16 x);
/**
* Compute natural logarithm.
*/
float __ovld __cnfn half_log(float x);
float2 __ovld __cnfn half_log(float2 x);
float3 __ovld __cnfn half_log(float3 x);
float4 __ovld __cnfn half_log(float4 x);
float8 __ovld __cnfn half_log(float8 x);
float16 __ovld __cnfn half_log(float16 x);
/**
* Compute a base 2 logarithm.
*/
float __ovld __cnfn half_log2(float x);
float2 __ovld __cnfn half_log2(float2 x);
float3 __ovld __cnfn half_log2(float3 x);
float4 __ovld __cnfn half_log2(float4 x);
float8 __ovld __cnfn half_log2(float8 x);
float16 __ovld __cnfn half_log2(float16 x);
/**
* Compute a base 10 logarithm.
*/
float __ovld __cnfn half_log10(float x);
float2 __ovld __cnfn half_log10(float2 x);
float3 __ovld __cnfn half_log10(float3 x);
float4 __ovld __cnfn half_log10(float4 x);
float8 __ovld __cnfn half_log10(float8 x);
float16 __ovld __cnfn half_log10(float16 x);
/**
* Compute x to the power y, where x is >= 0.
*/
float __ovld __cnfn half_powr(float x, float y);
float2 __ovld __cnfn half_powr(float2 x, float2 y);
float3 __ovld __cnfn half_powr(float3 x, float3 y);
float4 __ovld __cnfn half_powr(float4 x, float4 y);
float8 __ovld __cnfn half_powr(float8 x, float8 y);
float16 __ovld __cnfn half_powr(float16 x, float16 y);
/**
* Compute reciprocal.
*/
float __ovld __cnfn half_recip(float x);
float2 __ovld __cnfn half_recip(float2 x);
float3 __ovld __cnfn half_recip(float3 x);
float4 __ovld __cnfn half_recip(float4 x);
float8 __ovld __cnfn half_recip(float8 x);
float16 __ovld __cnfn half_recip(float16 x);
/**
* Compute inverse square root.
*/
float __ovld __cnfn half_rsqrt(float x);
float2 __ovld __cnfn half_rsqrt(float2 x);
float3 __ovld __cnfn half_rsqrt(float3 x);
float4 __ovld __cnfn half_rsqrt(float4 x);
float8 __ovld __cnfn half_rsqrt(float8 x);
float16 __ovld __cnfn half_rsqrt(float16 x);
/**
* Compute sine. x must be in the range -2^16 ... +2^16.
*/
float __ovld __cnfn half_sin(float x);
float2 __ovld __cnfn half_sin(float2 x);
float3 __ovld __cnfn half_sin(float3 x);
float4 __ovld __cnfn half_sin(float4 x);
float8 __ovld __cnfn half_sin(float8 x);
float16 __ovld __cnfn half_sin(float16 x);
/**
* Compute square root.
*/
float __ovld __cnfn half_sqrt(float x);
float2 __ovld __cnfn half_sqrt(float2 x);
float3 __ovld __cnfn half_sqrt(float3 x);
float4 __ovld __cnfn half_sqrt(float4 x);
float8 __ovld __cnfn half_sqrt(float8 x);
float16 __ovld __cnfn half_sqrt(float16 x);
/**
* Compute tangent. x must be in the range -2^16 ... +2^16.
*/
float __ovld __cnfn half_tan(float x);
float2 __ovld __cnfn half_tan(float2 x);
float3 __ovld __cnfn half_tan(float3 x);
float4 __ovld __cnfn half_tan(float4 x);
float8 __ovld __cnfn half_tan(float8 x);
float16 __ovld __cnfn half_tan(float16 x);
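/*
* Illustrative sketch (editor's addition): the half_ variants above trade
* precision for speed, for example in a rough logistic function. This is an
* approximation only; the function name is hypothetical.
*
*   float rough_sigmoid(float x) {
*     return half_recip(1.0f + half_exp(-x));  // ~ 1 / (1 + e^-x)
*   }
*/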
/**
* Compute cosine over an implementation-defined range.
* The maximum error is implementation-defined.
*/
float __ovld __cnfn native_cos(float x);
float2 __ovld __cnfn native_cos(float2 x);
float3 __ovld __cnfn native_cos(float3 x);
float4 __ovld __cnfn native_cos(float4 x);
float8 __ovld __cnfn native_cos(float8 x);
float16 __ovld __cnfn native_cos(float16 x);
/**
* Compute x / y over an implementation-defined range.
* The maximum error is implementation-defined.
*/
float __ovld __cnfn native_divide(float x, float y);
float2 __ovld __cnfn native_divide(float2 x, float2 y);
float3 __ovld __cnfn native_divide(float3 x, float3 y);
float4 __ovld __cnfn native_divide(float4 x, float4 y);
float8 __ovld __cnfn native_divide(float8 x, float8 y);
float16 __ovld __cnfn native_divide(float16 x, float16 y);
/**
* Compute the base-e exponential of x over an
* implementation-defined range. The maximum error is
* implementation-defined.
*/
float __ovld __cnfn native_exp(float x);
float2 __ovld __cnfn native_exp(float2 x);
float3 __ovld __cnfn native_exp(float3 x);
float4 __ovld __cnfn native_exp(float4 x);
float8 __ovld __cnfn native_exp(float8 x);
float16 __ovld __cnfn native_exp(float16 x);
/**
* Compute the base-2 exponential of x over an
* implementation-defined range. The maximum error is
* implementation-defined.
*/
float __ovld __cnfn native_exp2(float x);
float2 __ovld __cnfn native_exp2(float2 x);
float3 __ovld __cnfn native_exp2(float3 x);
float4 __ovld __cnfn native_exp2(float4 x);
float8 __ovld __cnfn native_exp2(float8 x);
float16 __ovld __cnfn native_exp2(float16 x);
/**
* Compute the base-10 exponential of x over an
* implementation-defined range. The maximum error is
* implementation-defined.
*/
float __ovld __cnfn native_exp10(float x);
float2 __ovld __cnfn native_exp10(float2 x);
float3 __ovld __cnfn native_exp10(float3 x);
float4 __ovld __cnfn native_exp10(float4 x);
float8 __ovld __cnfn native_exp10(float8 x);
float16 __ovld __cnfn native_exp10(float16 x);
/**
* Compute natural logarithm over an implementation-defined
* range. The maximum error is implementation-defined.
*/
float __ovld __cnfn native_log(float x);
float2 __ovld __cnfn native_log(float2 x);
float3 __ovld __cnfn native_log(float3 x);
float4 __ovld __cnfn native_log(float4 x);
float8 __ovld __cnfn native_log(float8 x);
float16 __ovld __cnfn native_log(float16 x);
/**
* Compute a base 2 logarithm over an implementation-defined
* range. The maximum error is implementation-defined.
*/
float __ovld __cnfn native_log2(float x);
float2 __ovld __cnfn native_log2(float2 x);
float3 __ovld __cnfn native_log2(float3 x);
float4 __ovld __cnfn native_log2(float4 x);
float8 __ovld __cnfn native_log2(float8 x);
float16 __ovld __cnfn native_log2(float16 x);
/**
* Compute a base 10 logarithm over an implementation-defined
* range. The maximum error is implementation-defined.
*/
float __ovld __cnfn native_log10(float x);
float2 __ovld __cnfn native_log10(float2 x);
float3 __ovld __cnfn native_log10(float3 x);
float4 __ovld __cnfn native_log10(float4 x);
float8 __ovld __cnfn native_log10(float8 x);
float16 __ovld __cnfn native_log10(float16 x);
/**
* Compute x to the power y, where x is >= 0. The range of
* x and y are implementation-defined. The maximum error
* is implementation-defined.
*/
float __ovld __cnfn native_powr(float x, float y);
float2 __ovld __cnfn native_powr(float2 x, float2 y);
float3 __ovld __cnfn native_powr(float3 x, float3 y);
float4 __ovld __cnfn native_powr(float4 x, float4 y);
float8 __ovld __cnfn native_powr(float8 x, float8 y);
float16 __ovld __cnfn native_powr(float16 x, float16 y);
/**
* Compute reciprocal over an implementation-defined
* range. The maximum error is implementation-defined.
*/
float __ovld __cnfn native_recip(float x);
float2 __ovld __cnfn native_recip(float2 x);
float3 __ovld __cnfn native_recip(float3 x);
float4 __ovld __cnfn native_recip(float4 x);
float8 __ovld __cnfn native_recip(float8 x);
float16 __ovld __cnfn native_recip(float16 x);
/**
* Compute inverse square root over an implementation-defined
* range. The maximum error is implementation-defined.
*/
float __ovld __cnfn native_rsqrt(float x);
float2 __ovld __cnfn native_rsqrt(float2 x);
float3 __ovld __cnfn native_rsqrt(float3 x);
float4 __ovld __cnfn native_rsqrt(float4 x);
float8 __ovld __cnfn native_rsqrt(float8 x);
float16 __ovld __cnfn native_rsqrt(float16 x);
/**
* Compute sine over an implementation-defined range.
* The maximum error is implementation-defined.
*/
float __ovld __cnfn native_sin(float x);
float2 __ovld __cnfn native_sin(float2 x);
float3 __ovld __cnfn native_sin(float3 x);
float4 __ovld __cnfn native_sin(float4 x);
float8 __ovld __cnfn native_sin(float8 x);
float16 __ovld __cnfn native_sin(float16 x);
/**
* Compute square root over an implementation-defined
* range. The maximum error is implementation-defined.
*/
float __ovld __cnfn native_sqrt(float x);
float2 __ovld __cnfn native_sqrt(float2 x);
float3 __ovld __cnfn native_sqrt(float3 x);
float4 __ovld __cnfn native_sqrt(float4 x);
float8 __ovld __cnfn native_sqrt(float8 x);
float16 __ovld __cnfn native_sqrt(float16 x);
/**
* Compute tangent over an implementation-defined range.
* The maximum error is implementation-defined.
*/
float __ovld __cnfn native_tan(float x);
float2 __ovld __cnfn native_tan(float2 x);
float3 __ovld __cnfn native_tan(float3 x);
float4 __ovld __cnfn native_tan(float4 x);
float8 __ovld __cnfn native_tan(float8 x);
float16 __ovld __cnfn native_tan(float16 x);
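// Illustrative sketch (hypothetical kernel, not part of the original
// declarations): the native_* builtins trade accuracy for speed, so they
// suit cases such as vector normalization where full precision is not needed.
__kernel void normalize_fast(__global float4 *v) {
    size_t gid = get_global_id(0);
    float4 p = v[gid];
    // Squared length, then 1/sqrt with implementation-defined accuracy.
    float len2 = p.x * p.x + p.y * p.y + p.z * p.z + p.w * p.w;
    v[gid] = p * native_rsqrt(len2);
}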
// OpenCL v1.1 s6.11.3, v1.2 s6.12.3, v2.0 s6.13.3 - Integer Functions
/**
* Returns | x |.
*/
uchar __ovld __cnfn abs(char x);
uchar __ovld __cnfn abs(uchar x);
uchar2 __ovld __cnfn abs(char2 x);
uchar2 __ovld __cnfn abs(uchar2 x);
uchar3 __ovld __cnfn abs(char3 x);
uchar3 __ovld __cnfn abs(uchar3 x);
uchar4 __ovld __cnfn abs(char4 x);
uchar4 __ovld __cnfn abs(uchar4 x);
uchar8 __ovld __cnfn abs(char8 x);
uchar8 __ovld __cnfn abs(uchar8 x);
uchar16 __ovld __cnfn abs(char16 x);
uchar16 __ovld __cnfn abs(uchar16 x);
ushort __ovld __cnfn abs(short x);
ushort __ovld __cnfn abs(ushort x);
ushort2 __ovld __cnfn abs(short2 x);
ushort2 __ovld __cnfn abs(ushort2 x);
ushort3 __ovld __cnfn abs(short3 x);
ushort3 __ovld __cnfn abs(ushort3 x);
ushort4 __ovld __cnfn abs(short4 x);
ushort4 __ovld __cnfn abs(ushort4 x);
ushort8 __ovld __cnfn abs(short8 x);
ushort8 __ovld __cnfn abs(ushort8 x);
ushort16 __ovld __cnfn abs(short16 x);
ushort16 __ovld __cnfn abs(ushort16 x);
uint __ovld __cnfn abs(int x);
uint __ovld __cnfn abs(uint x);
uint2 __ovld __cnfn abs(int2 x);
uint2 __ovld __cnfn abs(uint2 x);
uint3 __ovld __cnfn abs(int3 x);
uint3 __ovld __cnfn abs(uint3 x);
uint4 __ovld __cnfn abs(int4 x);
uint4 __ovld __cnfn abs(uint4 x);
uint8 __ovld __cnfn abs(int8 x);
uint8 __ovld __cnfn abs(uint8 x);
uint16 __ovld __cnfn abs(int16 x);
uint16 __ovld __cnfn abs(uint16 x);
ulong __ovld __cnfn abs(long x);
ulong __ovld __cnfn abs(ulong x);
ulong2 __ovld __cnfn abs(long2 x);
ulong2 __ovld __cnfn abs(ulong2 x);
ulong3 __ovld __cnfn abs(long3 x);
ulong3 __ovld __cnfn abs(ulong3 x);
ulong4 __ovld __cnfn abs(long4 x);
ulong4 __ovld __cnfn abs(ulong4 x);
ulong8 __ovld __cnfn abs(long8 x);
ulong8 __ovld __cnfn abs(ulong8 x);
ulong16 __ovld __cnfn abs(long16 x);
ulong16 __ovld __cnfn abs(ulong16 x);
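// Illustrative sketch (hypothetical kernel): abs() on a signed type returns
// the corresponding unsigned type, so even the most negative value has a
// representable absolute value, e.g. abs((char)-128) == (uchar)128.
__kernel void abs_demo(__global const char *in, __global uchar *out) {
    size_t gid = get_global_id(0);
    out[gid] = abs(in[gid]);
}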
/**
* Returns | x - y | without modulo overflow.
*/
uchar __ovld __cnfn abs_diff(char x, char y);
uchar __ovld __cnfn abs_diff(uchar x, uchar y);
uchar2 __ovld __cnfn abs_diff(char2 x, char2 y);
uchar2 __ovld __cnfn abs_diff(uchar2 x, uchar2 y);
uchar3 __ovld __cnfn abs_diff(char3 x, char3 y);
uchar3 __ovld __cnfn abs_diff(uchar3 x, uchar3 y);
uchar4 __ovld __cnfn abs_diff(char4 x, char4 y);
uchar4 __ovld __cnfn abs_diff(uchar4 x, uchar4 y);
uchar8 __ovld __cnfn abs_diff(char8 x, char8 y);
uchar8 __ovld __cnfn abs_diff(uchar8 x, uchar8 y);
uchar16 __ovld __cnfn abs_diff(char16 x, char16 y);
uchar16 __ovld __cnfn abs_diff(uchar16 x, uchar16 y);
ushort __ovld __cnfn abs_diff(short x, short y);
ushort __ovld __cnfn abs_diff(ushort x, ushort y);
ushort2 __ovld __cnfn abs_diff(short2 x, short2 y);
ushort2 __ovld __cnfn abs_diff(ushort2 x, ushort2 y);
ushort3 __ovld __cnfn abs_diff(short3 x, short3 y);
ushort3 __ovld __cnfn abs_diff(ushort3 x, ushort3 y);
ushort4 __ovld __cnfn abs_diff(short4 x, short4 y);
ushort4 __ovld __cnfn abs_diff(ushort4 x, ushort4 y);
ushort8 __ovld __cnfn abs_diff(short8 x, short8 y);
ushort8 __ovld __cnfn abs_diff(ushort8 x, ushort8 y);
ushort16 __ovld __cnfn abs_diff(short16 x, short16 y);
ushort16 __ovld __cnfn abs_diff(ushort16 x, ushort16 y);
uint __ovld __cnfn abs_diff(int x, int y);
uint __ovld __cnfn abs_diff(uint x, uint y);
uint2 __ovld __cnfn abs_diff(int2 x, int2 y);
uint2 __ovld __cnfn abs_diff(uint2 x, uint2 y);
uint3 __ovld __cnfn abs_diff(int3 x, int3 y);
uint3 __ovld __cnfn abs_diff(uint3 x, uint3 y);
uint4 __ovld __cnfn abs_diff(int4 x, int4 y);
uint4 __ovld __cnfn abs_diff(uint4 x, uint4 y);
uint8 __ovld __cnfn abs_diff(int8 x, int8 y);
uint8 __ovld __cnfn abs_diff(uint8 x, uint8 y);
uint16 __ovld __cnfn abs_diff(int16 x, int16 y);
uint16 __ovld __cnfn abs_diff(uint16 x, uint16 y);
ulong __ovld __cnfn abs_diff(long x, long y);
ulong __ovld __cnfn abs_diff(ulong x, ulong y);
ulong2 __ovld __cnfn abs_diff(long2 x, long2 y);
ulong2 __ovld __cnfn abs_diff(ulong2 x, ulong2 y);
ulong3 __ovld __cnfn abs_diff(long3 x, long3 y);
ulong3 __ovld __cnfn abs_diff(ulong3 x, ulong3 y);
ulong4 __ovld __cnfn abs_diff(long4 x, long4 y);
ulong4 __ovld __cnfn abs_diff(ulong4 x, ulong4 y);
ulong8 __ovld __cnfn abs_diff(long8 x, long8 y);
ulong8 __ovld __cnfn abs_diff(ulong8 x, ulong8 y);
ulong16 __ovld __cnfn abs_diff(long16 x, long16 y);
ulong16 __ovld __cnfn abs_diff(ulong16 x, ulong16 y);
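// Illustrative sketch (hypothetical kernel): abs_diff gives |x - y| directly,
// avoiding the wraparound that subtracting first would produce for unsigned
// operands, e.g. abs_diff((uchar)3, (uchar)250) == 247.
__kernel void pixel_delta(__global const uchar *a, __global const uchar *b,
                          __global uchar *d) {
    size_t gid = get_global_id(0);
    d[gid] = abs_diff(a[gid], b[gid]);
}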
/**
* Returns x + y and saturates the result.
*/
char __ovld __cnfn add_sat(char x, char y);
uchar __ovld __cnfn add_sat(uchar x, uchar y);
char2 __ovld __cnfn add_sat(char2 x, char2 y);
uchar2 __ovld __cnfn add_sat(uchar2 x, uchar2 y);
char3 __ovld __cnfn add_sat(char3 x, char3 y);
uchar3 __ovld __cnfn add_sat(uchar3 x, uchar3 y);
char4 __ovld __cnfn add_sat(char4 x, char4 y);
uchar4 __ovld __cnfn add_sat(uchar4 x, uchar4 y);
char8 __ovld __cnfn add_sat(char8 x, char8 y);
uchar8 __ovld __cnfn add_sat(uchar8 x, uchar8 y);
char16 __ovld __cnfn add_sat(char16 x, char16 y);
uchar16 __ovld __cnfn add_sat(uchar16 x, uchar16 y);
short __ovld __cnfn add_sat(short x, short y);
ushort __ovld __cnfn add_sat(ushort x, ushort y);
short2 __ovld __cnfn add_sat(short2 x, short2 y);
ushort2 __ovld __cnfn add_sat(ushort2 x, ushort2 y);
short3 __ovld __cnfn add_sat(short3 x, short3 y);
ushort3 __ovld __cnfn add_sat(ushort3 x, ushort3 y);
short4 __ovld __cnfn add_sat(short4 x, short4 y);
ushort4 __ovld __cnfn add_sat(ushort4 x, ushort4 y);
short8 __ovld __cnfn add_sat(short8 x, short8 y);
ushort8 __ovld __cnfn add_sat(ushort8 x, ushort8 y);
short16 __ovld __cnfn add_sat(short16 x, short16 y);
ushort16 __ovld __cnfn add_sat(ushort16 x, ushort16 y);
int __ovld __cnfn add_sat(int x, int y);
uint __ovld __cnfn add_sat(uint x, uint y);
int2 __ovld __cnfn add_sat(int2 x, int2 y);
uint2 __ovld __cnfn add_sat(uint2 x, uint2 y);
int3 __ovld __cnfn add_sat(int3 x, int3 y);
uint3 __ovld __cnfn add_sat(uint3 x, uint3 y);
int4 __ovld __cnfn add_sat(int4 x, int4 y);
uint4 __ovld __cnfn add_sat(uint4 x, uint4 y);
int8 __ovld __cnfn add_sat(int8 x, int8 y);
uint8 __ovld __cnfn add_sat(uint8 x, uint8 y);
int16 __ovld __cnfn add_sat(int16 x, int16 y);
uint16 __ovld __cnfn add_sat(uint16 x, uint16 y);
long __ovld __cnfn add_sat(long x, long y);
ulong __ovld __cnfn add_sat(ulong x, ulong y);
long2 __ovld __cnfn add_sat(long2 x, long2 y);
ulong2 __ovld __cnfn add_sat(ulong2 x, ulong2 y);
long3 __ovld __cnfn add_sat(long3 x, long3 y);
ulong3 __ovld __cnfn add_sat(ulong3 x, ulong3 y);
long4 __ovld __cnfn add_sat(long4 x, long4 y);
ulong4 __ovld __cnfn add_sat(ulong4 x, ulong4 y);
long8 __ovld __cnfn add_sat(long8 x, long8 y);
ulong8 __ovld __cnfn add_sat(ulong8 x, ulong8 y);
long16 __ovld __cnfn add_sat(long16 x, long16 y);
ulong16 __ovld __cnfn add_sat(ulong16 x, ulong16 y);
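// Illustrative sketch (hypothetical kernel): add_sat clamps to the type's
// range instead of wrapping, e.g. add_sat((uchar)200, (uchar)100) == 255,
// which is usually the behaviour wanted when brightening 8-bit image data.
__kernel void brighten(__global uchar4 *img, uchar4 offset) {
    size_t gid = get_global_id(0);
    img[gid] = add_sat(img[gid], offset);
}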
/**
* Returns (x + y) >> 1. The intermediate sum does
* not modulo overflow.
*/
char __ovld __cnfn hadd(char x, char y);
uchar __ovld __cnfn hadd(uchar x, uchar y);
char2 __ovld __cnfn hadd(char2 x, char2 y);
uchar2 __ovld __cnfn hadd(uchar2 x, uchar2 y);
char3 __ovld __cnfn hadd(char3 x, char3 y);
uchar3 __ovld __cnfn hadd(uchar3 x, uchar3 y);
char4 __ovld __cnfn hadd(char4 x, char4 y);
uchar4 __ovld __cnfn hadd(uchar4 x, uchar4 y);
char8 __ovld __cnfn hadd(char8 x, char8 y);
uchar8 __ovld __cnfn hadd(uchar8 x, uchar8 y);
char16 __ovld __cnfn hadd(char16 x, char16 y);
uchar16 __ovld __cnfn hadd(uchar16 x, uchar16 y);
short __ovld __cnfn hadd(short x, short y);
ushort __ovld __cnfn hadd(ushort x, ushort y);
short2 __ovld __cnfn hadd(short2 x, short2 y);
ushort2 __ovld __cnfn hadd(ushort2 x, ushort2 y);
short3 __ovld __cnfn hadd(short3 x, short3 y);
ushort3 __ovld __cnfn hadd(ushort3 x, ushort3 y);
short4 __ovld __cnfn hadd(short4 x, short4 y);
ushort4 __ovld __cnfn hadd(ushort4 x, ushort4 y);
short8 __ovld __cnfn hadd(short8 x, short8 y);
ushort8 __ovld __cnfn hadd(ushort8 x, ushort8 y);
short16 __ovld __cnfn hadd(short16 x, short16 y);
ushort16 __ovld __cnfn hadd(ushort16 x, ushort16 y);
int __ovld __cnfn hadd(int x, int y);
uint __ovld __cnfn hadd(uint x, uint y);
int2 __ovld __cnfn hadd(int2 x, int2 y);
uint2 __ovld __cnfn hadd(uint2 x, uint2 y);
int3 __ovld __cnfn hadd(int3 x, int3 y);
uint3 __ovld __cnfn hadd(uint3 x, uint3 y);
int4 __ovld __cnfn hadd(int4 x, int4 y);
uint4 __ovld __cnfn hadd(uint4 x, uint4 y);
int8 __ovld __cnfn hadd(int8 x, int8 y);
uint8 __ovld __cnfn hadd(uint8 x, uint8 y);
int16 __ovld __cnfn hadd(int16 x, int16 y);
uint16 __ovld __cnfn hadd(uint16 x, uint16 y);
long __ovld __cnfn hadd(long x, long y);
ulong __ovld __cnfn hadd(ulong x, ulong y);
long2 __ovld __cnfn hadd(long2 x, long2 y);
ulong2 __ovld __cnfn hadd(ulong2 x, ulong2 y);
long3 __ovld __cnfn hadd(long3 x, long3 y);
ulong3 __ovld __cnfn hadd(ulong3 x, ulong3 y);
long4 __ovld __cnfn hadd(long4 x, long4 y);
ulong4 __ovld __cnfn hadd(ulong4 x, ulong4 y);
long8 __ovld __cnfn hadd(long8 x, long8 y);
ulong8 __ovld __cnfn hadd(ulong8 x, ulong8 y);
long16 __ovld __cnfn hadd(long16 x, long16 y);
ulong16 __ovld __cnfn hadd(ulong16 x, ulong16 y);
/**
* Returns (x + y + 1) >> 1. The intermediate sum
* does not modulo overflow.
*/
char __ovld __cnfn rhadd(char x, char y);
uchar __ovld __cnfn rhadd(uchar x, uchar y);
char2 __ovld __cnfn rhadd(char2 x, char2 y);
uchar2 __ovld __cnfn rhadd(uchar2 x, uchar2 y);
char3 __ovld __cnfn rhadd(char3 x, char3 y);
uchar3 __ovld __cnfn rhadd(uchar3 x, uchar3 y);
char4 __ovld __cnfn rhadd(char4 x, char4 y);
uchar4 __ovld __cnfn rhadd(uchar4 x, uchar4 y);
char8 __ovld __cnfn rhadd(char8 x, char8 y);
uchar8 __ovld __cnfn rhadd(uchar8 x, uchar8 y);
char16 __ovld __cnfn rhadd(char16 x, char16 y);
uchar16 __ovld __cnfn rhadd(uchar16 x, uchar16 y);
short __ovld __cnfn rhadd(short x, short y);
ushort __ovld __cnfn rhadd(ushort x, ushort y);
short2 __ovld __cnfn rhadd(short2 x, short2 y);
ushort2 __ovld __cnfn rhadd(ushort2 x, ushort2 y);
short3 __ovld __cnfn rhadd(short3 x, short3 y);
ushort3 __ovld __cnfn rhadd(ushort3 x, ushort3 y);
short4 __ovld __cnfn rhadd(short4 x, short4 y);
ushort4 __ovld __cnfn rhadd(ushort4 x, ushort4 y);
short8 __ovld __cnfn rhadd(short8 x, short8 y);
ushort8 __ovld __cnfn rhadd(ushort8 x, ushort8 y);
short16 __ovld __cnfn rhadd(short16 x, short16 y);
ushort16 __ovld __cnfn rhadd(ushort16 x, ushort16 y);
int __ovld __cnfn rhadd(int x, int y);
uint __ovld __cnfn rhadd(uint x, uint y);
int2 __ovld __cnfn rhadd(int2 x, int2 y);
uint2 __ovld __cnfn rhadd(uint2 x, uint2 y);
int3 __ovld __cnfn rhadd(int3 x, int3 y);
uint3 __ovld __cnfn rhadd(uint3 x, uint3 y);
int4 __ovld __cnfn rhadd(int4 x, int4 y);
uint4 __ovld __cnfn rhadd(uint4 x, uint4 y);
int8 __ovld __cnfn rhadd(int8 x, int8 y);
uint8 __ovld __cnfn rhadd(uint8 x, uint8 y);
int16 __ovld __cnfn rhadd(int16 x, int16 y);
uint16 __ovld __cnfn rhadd(uint16 x, uint16 y);
long __ovld __cnfn rhadd(long x, long y);
ulong __ovld __cnfn rhadd(ulong x, ulong y);
long2 __ovld __cnfn rhadd(long2 x, long2 y);
ulong2 __ovld __cnfn rhadd(ulong2 x, ulong2 y);
long3 __ovld __cnfn rhadd(long3 x, long3 y);
ulong3 __ovld __cnfn rhadd(ulong3 x, ulong3 y);
long4 __ovld __cnfn rhadd(long4 x, long4 y);
ulong4 __ovld __cnfn rhadd(ulong4 x, ulong4 y);
long8 __ovld __cnfn rhadd(long8 x, long8 y);
ulong8 __ovld __cnfn rhadd(ulong8 x, ulong8 y);
long16 __ovld __cnfn rhadd(long16 x, long16 y);
ulong16 __ovld __cnfn rhadd(ulong16 x, ulong16 y);
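// Illustrative sketch (hypothetical kernel): rhadd is a rounded average whose
// intermediate sum cannot overflow, unlike the naive (a + b + 1) / 2,
// e.g. rhadd((uchar)255, (uchar)254) == 255.
__kernel void blend_avg(__global const uchar *a, __global const uchar *b,
                        __global uchar *out) {
    size_t gid = get_global_id(0);
    out[gid] = rhadd(a[gid], b[gid]);
}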
/**
* Returns min(max(x, minval), maxval).
* Results are undefined if minval > maxval.
*/
char __ovld __cnfn clamp(char x, char minval, char maxval);
uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval);
char2 __ovld __cnfn clamp(char2 x, char2 minval, char2 maxval);
uchar2 __ovld __cnfn clamp(uchar2 x, uchar2 minval, uchar2 maxval);
char3 __ovld __cnfn clamp(char3 x, char3 minval, char3 maxval);
uchar3 __ovld __cnfn clamp(uchar3 x, uchar3 minval, uchar3 maxval);
char4 __ovld __cnfn clamp(char4 x, char4 minval, char4 maxval);
uchar4 __ovld __cnfn clamp(uchar4 x, uchar4 minval, uchar4 maxval);
char8 __ovld __cnfn clamp(char8 x, char8 minval, char8 maxval);
uchar8 __ovld __cnfn clamp(uchar8 x, uchar8 minval, uchar8 maxval);
char16 __ovld __cnfn clamp(char16 x, char16 minval, char16 maxval);
uchar16 __ovld __cnfn clamp(uchar16 x, uchar16 minval, uchar16 maxval);
short __ovld __cnfn clamp(short x, short minval, short maxval);
ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval);
short2 __ovld __cnfn clamp(short2 x, short2 minval, short2 maxval);
ushort2 __ovld __cnfn clamp(ushort2 x, ushort2 minval, ushort2 maxval);
short3 __ovld __cnfn clamp(short3 x, short3 minval, short3 maxval);
ushort3 __ovld __cnfn clamp(ushort3 x, ushort3 minval, ushort3 maxval);
short4 __ovld __cnfn clamp(short4 x, short4 minval, short4 maxval);
ushort4 __ovld __cnfn clamp(ushort4 x, ushort4 minval, ushort4 maxval);
short8 __ovld __cnfn clamp(short8 x, short8 minval, short8 maxval);
ushort8 __ovld __cnfn clamp(ushort8 x, ushort8 minval, ushort8 maxval);
short16 __ovld __cnfn clamp(short16 x, short16 minval, short16 maxval);
ushort16 __ovld __cnfn clamp(ushort16 x, ushort16 minval, ushort16 maxval);
int __ovld __cnfn clamp(int x, int minval, int maxval);
uint __ovld __cnfn clamp(uint x, uint minval, uint maxval);
int2 __ovld __cnfn clamp(int2 x, int2 minval, int2 maxval);
uint2 __ovld __cnfn clamp(uint2 x, uint2 minval, uint2 maxval);
int3 __ovld __cnfn clamp(int3 x, int3 minval, int3 maxval);
uint3 __ovld __cnfn clamp(uint3 x, uint3 minval, uint3 maxval);
int4 __ovld __cnfn clamp(int4 x, int4 minval, int4 maxval);
uint4 __ovld __cnfn clamp(uint4 x, uint4 minval, uint4 maxval);
int8 __ovld __cnfn clamp(int8 x, int8 minval, int8 maxval);
uint8 __ovld __cnfn clamp(uint8 x, uint8 minval, uint8 maxval);
int16 __ovld __cnfn clamp(int16 x, int16 minval, int16 maxval);
uint16 __ovld __cnfn clamp(uint16 x, uint16 minval, uint16 maxval);
long __ovld __cnfn clamp(long x, long minval, long maxval);
ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval);
long2 __ovld __cnfn clamp(long2 x, long2 minval, long2 maxval);
ulong2 __ovld __cnfn clamp(ulong2 x, ulong2 minval, ulong2 maxval);
long3 __ovld __cnfn clamp(long3 x, long3 minval, long3 maxval);
ulong3 __ovld __cnfn clamp(ulong3 x, ulong3 minval, ulong3 maxval);
long4 __ovld __cnfn clamp(long4 x, long4 minval, long4 maxval);
ulong4 __ovld __cnfn clamp(ulong4 x, ulong4 minval, ulong4 maxval);
long8 __ovld __cnfn clamp(long8 x, long8 minval, long8 maxval);
ulong8 __ovld __cnfn clamp(ulong8 x, ulong8 minval, ulong8 maxval);
long16 __ovld __cnfn clamp(long16 x, long16 minval, long16 maxval);
ulong16 __ovld __cnfn clamp(ulong16 x, ulong16 minval, ulong16 maxval);
char2 __ovld __cnfn clamp(char2 x, char minval, char maxval);
uchar2 __ovld __cnfn clamp(uchar2 x, uchar minval, uchar maxval);
char3 __ovld __cnfn clamp(char3 x, char minval, char maxval);
uchar3 __ovld __cnfn clamp(uchar3 x, uchar minval, uchar maxval);
char4 __ovld __cnfn clamp(char4 x, char minval, char maxval);
uchar4 __ovld __cnfn clamp(uchar4 x, uchar minval, uchar maxval);
char8 __ovld __cnfn clamp(char8 x, char minval, char maxval);
uchar8 __ovld __cnfn clamp(uchar8 x, uchar minval, uchar maxval);
char16 __ovld __cnfn clamp(char16 x, char minval, char maxval);
uchar16 __ovld __cnfn clamp(uchar16 x, uchar minval, uchar maxval);
short2 __ovld __cnfn clamp(short2 x, short minval, short maxval);
ushort2 __ovld __cnfn clamp(ushort2 x, ushort minval, ushort maxval);
short3 __ovld __cnfn clamp(short3 x, short minval, short maxval);
ushort3 __ovld __cnfn clamp(ushort3 x, ushort minval, ushort maxval);
short4 __ovld __cnfn clamp(short4 x, short minval, short maxval);
ushort4 __ovld __cnfn clamp(ushort4 x, ushort minval, ushort maxval);
short8 __ovld __cnfn clamp(short8 x, short minval, short maxval);
ushort8 __ovld __cnfn clamp(ushort8 x, ushort minval, ushort maxval);
short16 __ovld __cnfn clamp(short16 x, short minval, short maxval);
ushort16 __ovld __cnfn clamp(ushort16 x, ushort minval, ushort maxval);
int2 __ovld __cnfn clamp(int2 x, int minval, int maxval);
uint2 __ovld __cnfn clamp(uint2 x, uint minval, uint maxval);
int3 __ovld __cnfn clamp(int3 x, int minval, int maxval);
uint3 __ovld __cnfn clamp(uint3 x, uint minval, uint maxval);
int4 __ovld __cnfn clamp(int4 x, int minval, int maxval);
uint4 __ovld __cnfn clamp(uint4 x, uint minval, uint maxval);
int8 __ovld __cnfn clamp(int8 x, int minval, int maxval);
uint8 __ovld __cnfn clamp(uint8 x, uint minval, uint maxval);
int16 __ovld __cnfn clamp(int16 x, int minval, int maxval);
uint16 __ovld __cnfn clamp(uint16 x, uint minval, uint maxval);
long2 __ovld __cnfn clamp(long2 x, long minval, long maxval);
ulong2 __ovld __cnfn clamp(ulong2 x, ulong minval, ulong maxval);
long3 __ovld __cnfn clamp(long3 x, long minval, long maxval);
ulong3 __ovld __cnfn clamp(ulong3 x, ulong minval, ulong maxval);
long4 __ovld __cnfn clamp(long4 x, long minval, long maxval);
ulong4 __ovld __cnfn clamp(ulong4 x, ulong minval, ulong maxval);
long8 __ovld __cnfn clamp(long8 x, long minval, long maxval);
ulong8 __ovld __cnfn clamp(ulong8 x, ulong minval, ulong maxval);
long16 __ovld __cnfn clamp(long16 x, long minval, long maxval);
ulong16 __ovld __cnfn clamp(ulong16 x, ulong minval, ulong maxval);
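// Illustrative sketch (hypothetical kernel): the vector/scalar overloads let
// one pair of scalar bounds clamp every component of a vector operand.
__kernel void clamp_to_10bit(__global short4 *data) {
    size_t gid = get_global_id(0);
    data[gid] = clamp(data[gid], (short)0, (short)1023);  // per component
}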
/**
* Returns the number of leading 0-bits in x, starting
* at the most significant bit position.
*/
char __ovld __cnfn clz(char x);
uchar __ovld __cnfn clz(uchar x);
char2 __ovld __cnfn clz(char2 x);
uchar2 __ovld __cnfn clz(uchar2 x);
char3 __ovld __cnfn clz(char3 x);
uchar3 __ovld __cnfn clz(uchar3 x);
char4 __ovld __cnfn clz(char4 x);
uchar4 __ovld __cnfn clz(uchar4 x);
char8 __ovld __cnfn clz(char8 x);
uchar8 __ovld __cnfn clz(uchar8 x);
char16 __ovld __cnfn clz(char16 x);
uchar16 __ovld __cnfn clz(uchar16 x);
short __ovld __cnfn clz(short x);
ushort __ovld __cnfn clz(ushort x);
short2 __ovld __cnfn clz(short2 x);
ushort2 __ovld __cnfn clz(ushort2 x);
short3 __ovld __cnfn clz(short3 x);
ushort3 __ovld __cnfn clz(ushort3 x);
short4 __ovld __cnfn clz(short4 x);
ushort4 __ovld __cnfn clz(ushort4 x);
short8 __ovld __cnfn clz(short8 x);
ushort8 __ovld __cnfn clz(ushort8 x);
short16 __ovld __cnfn clz(short16 x);
ushort16 __ovld __cnfn clz(ushort16 x);
int __ovld __cnfn clz(int x);
uint __ovld __cnfn clz(uint x);
int2 __ovld __cnfn clz(int2 x);
uint2 __ovld __cnfn clz(uint2 x);
int3 __ovld __cnfn clz(int3 x);
uint3 __ovld __cnfn clz(uint3 x);
int4 __ovld __cnfn clz(int4 x);
uint4 __ovld __cnfn clz(uint4 x);
int8 __ovld __cnfn clz(int8 x);
uint8 __ovld __cnfn clz(uint8 x);
int16 __ovld __cnfn clz(int16 x);
uint16 __ovld __cnfn clz(uint16 x);
long __ovld __cnfn clz(long x);
ulong __ovld __cnfn clz(ulong x);
long2 __ovld __cnfn clz(long2 x);
ulong2 __ovld __cnfn clz(ulong2 x);
long3 __ovld __cnfn clz(long3 x);
ulong3 __ovld __cnfn clz(ulong3 x);
long4 __ovld __cnfn clz(long4 x);
ulong4 __ovld __cnfn clz(ulong4 x);
long8 __ovld __cnfn clz(long8 x);
ulong8 __ovld __cnfn clz(ulong8 x);
long16 __ovld __cnfn clz(long16 x);
ulong16 __ovld __cnfn clz(ulong16 x);
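// Illustrative sketch (hypothetical kernel): for non-zero x, 31 - clz(x) is
// the index of the highest set bit, i.e. floor(log2(x)).
__kernel void ilog2(__global const uint *in, __global uint *out) {
    size_t gid = get_global_id(0);
    uint x = in[gid];
    out[gid] = (x != 0u) ? (31u - clz(x)) : 0u;  // e.g. x == 20 -> 4
}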
/**
* Returns the count of trailing 0-bits in x. If x is 0,
* returns the size in bits of the type of x or
* component type of x, if x is a vector.
*/
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
char __ovld ctz(char x);
uchar __ovld ctz(uchar x);
char2 __ovld ctz(char2 x);
uchar2 __ovld ctz(uchar2 x);
char3 __ovld ctz(char3 x);
uchar3 __ovld ctz(uchar3 x);
char4 __ovld ctz(char4 x);
uchar4 __ovld ctz(uchar4 x);
char8 __ovld ctz(char8 x);
uchar8 __ovld ctz(uchar8 x);
char16 __ovld ctz(char16 x);
uchar16 __ovld ctz(uchar16 x);
short __ovld ctz(short x);
ushort __ovld ctz(ushort x);
short2 __ovld ctz(short2 x);
ushort2 __ovld ctz(ushort2 x);
short3 __ovld ctz(short3 x);
ushort3 __ovld ctz(ushort3 x);
short4 __ovld ctz(short4 x);
ushort4 __ovld ctz(ushort4 x);
short8 __ovld ctz(short8 x);
ushort8 __ovld ctz(ushort8 x);
short16 __ovld ctz(short16 x);
ushort16 __ovld ctz(ushort16 x);
int __ovld ctz(int x);
uint __ovld ctz(uint x);
int2 __ovld ctz(int2 x);
uint2 __ovld ctz(uint2 x);
int3 __ovld ctz(int3 x);
uint3 __ovld ctz(uint3 x);
int4 __ovld ctz(int4 x);
uint4 __ovld ctz(uint4 x);
int8 __ovld ctz(int8 x);
uint8 __ovld ctz(uint8 x);
int16 __ovld ctz(int16 x);
uint16 __ovld ctz(uint16 x);
long __ovld ctz(long x);
ulong __ovld ctz(ulong x);
long2 __ovld ctz(long2 x);
ulong2 __ovld ctz(ulong2 x);
long3 __ovld ctz(long3 x);
ulong3 __ovld ctz(ulong3 x);
long4 __ovld ctz(long4 x);
ulong4 __ovld ctz(ulong4 x);
long8 __ovld ctz(long8 x);
ulong8 __ovld ctz(ulong8 x);
long16 __ovld ctz(long16 x);
ulong16 __ovld ctz(ulong16 x);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
/**
* Returns mul_hi(a, b) + c.
*/
char __ovld __cnfn mad_hi(char a, char b, char c);
uchar __ovld __cnfn mad_hi(uchar a, uchar b, uchar c);
char2 __ovld __cnfn mad_hi(char2 a, char2 b, char2 c);
uchar2 __ovld __cnfn mad_hi(uchar2 a, uchar2 b, uchar2 c);
char3 __ovld __cnfn mad_hi(char3 a, char3 b, char3 c);
uchar3 __ovld __cnfn mad_hi(uchar3 a, uchar3 b, uchar3 c);
char4 __ovld __cnfn mad_hi(char4 a, char4 b, char4 c);
uchar4 __ovld __cnfn mad_hi(uchar4 a, uchar4 b, uchar4 c);
char8 __ovld __cnfn mad_hi(char8 a, char8 b, char8 c);
uchar8 __ovld __cnfn mad_hi(uchar8 a, uchar8 b, uchar8 c);
char16 __ovld __cnfn mad_hi(char16 a, char16 b, char16 c);
uchar16 __ovld __cnfn mad_hi(uchar16 a, uchar16 b, uchar16 c);
short __ovld __cnfn mad_hi(short a, short b, short c);
ushort __ovld __cnfn mad_hi(ushort a, ushort b, ushort c);
short2 __ovld __cnfn mad_hi(short2 a, short2 b, short2 c);
ushort2 __ovld __cnfn mad_hi(ushort2 a, ushort2 b, ushort2 c);
short3 __ovld __cnfn mad_hi(short3 a, short3 b, short3 c);
ushort3 __ovld __cnfn mad_hi(ushort3 a, ushort3 b, ushort3 c);
short4 __ovld __cnfn mad_hi(short4 a, short4 b, short4 c);
ushort4 __ovld __cnfn mad_hi(ushort4 a, ushort4 b, ushort4 c);
short8 __ovld __cnfn mad_hi(short8 a, short8 b, short8 c);
ushort8 __ovld __cnfn mad_hi(ushort8 a, ushort8 b, ushort8 c);
short16 __ovld __cnfn mad_hi(short16 a, short16 b, short16 c);
ushort16 __ovld __cnfn mad_hi(ushort16 a, ushort16 b, ushort16 c);
int __ovld __cnfn mad_hi(int a, int b, int c);
uint __ovld __cnfn mad_hi(uint a, uint b, uint c);
int2 __ovld __cnfn mad_hi(int2 a, int2 b, int2 c);
uint2 __ovld __cnfn mad_hi(uint2 a, uint2 b, uint2 c);
int3 __ovld __cnfn mad_hi(int3 a, int3 b, int3 c);
uint3 __ovld __cnfn mad_hi(uint3 a, uint3 b, uint3 c);
int4 __ovld __cnfn mad_hi(int4 a, int4 b, int4 c);
uint4 __ovld __cnfn mad_hi(uint4 a, uint4 b, uint4 c);
int8 __ovld __cnfn mad_hi(int8 a, int8 b, int8 c);
uint8 __ovld __cnfn mad_hi(uint8 a, uint8 b, uint8 c);
int16 __ovld __cnfn mad_hi(int16 a, int16 b, int16 c);
uint16 __ovld __cnfn mad_hi(uint16 a, uint16 b, uint16 c);
long __ovld __cnfn mad_hi(long a, long b, long c);
ulong __ovld __cnfn mad_hi(ulong a, ulong b, ulong c);
long2 __ovld __cnfn mad_hi(long2 a, long2 b, long2 c);
ulong2 __ovld __cnfn mad_hi(ulong2 a, ulong2 b, ulong2 c);
long3 __ovld __cnfn mad_hi(long3 a, long3 b, long3 c);
ulong3 __ovld __cnfn mad_hi(ulong3 a, ulong3 b, ulong3 c);
long4 __ovld __cnfn mad_hi(long4 a, long4 b, long4 c);
ulong4 __ovld __cnfn mad_hi(ulong4 a, ulong4 b, ulong4 c);
long8 __ovld __cnfn mad_hi(long8 a, long8 b, long8 c);
ulong8 __ovld __cnfn mad_hi(ulong8 a, ulong8 b, ulong8 c);
long16 __ovld __cnfn mad_hi(long16 a, long16 b, long16 c);
ulong16 __ovld __cnfn mad_hi(ulong16 a, ulong16 b, ulong16 c);
/**
* Returns a * b + c and saturates the result.
*/
char __ovld __cnfn mad_sat(char a, char b, char c);
uchar __ovld __cnfn mad_sat(uchar a, uchar b, uchar c);
char2 __ovld __cnfn mad_sat(char2 a, char2 b, char2 c);
uchar2 __ovld __cnfn mad_sat(uchar2 a, uchar2 b, uchar2 c);
char3 __ovld __cnfn mad_sat(char3 a, char3 b, char3 c);
uchar3 __ovld __cnfn mad_sat(uchar3 a, uchar3 b, uchar3 c);
char4 __ovld __cnfn mad_sat(char4 a, char4 b, char4 c);
uchar4 __ovld __cnfn mad_sat(uchar4 a, uchar4 b, uchar4 c);
char8 __ovld __cnfn mad_sat(char8 a, char8 b, char8 c);
uchar8 __ovld __cnfn mad_sat(uchar8 a, uchar8 b, uchar8 c);
char16 __ovld __cnfn mad_sat(char16 a, char16 b, char16 c);
uchar16 __ovld __cnfn mad_sat(uchar16 a, uchar16 b, uchar16 c);
short __ovld __cnfn mad_sat(short a, short b, short c);
ushort __ovld __cnfn mad_sat(ushort a, ushort b, ushort c);
short2 __ovld __cnfn mad_sat(short2 a, short2 b, short2 c);
ushort2 __ovld __cnfn mad_sat(ushort2 a, ushort2 b, ushort2 c);
short3 __ovld __cnfn mad_sat(short3 a, short3 b, short3 c);
ushort3 __ovld __cnfn mad_sat(ushort3 a, ushort3 b, ushort3 c);
short4 __ovld __cnfn mad_sat(short4 a, short4 b, short4 c);
ushort4 __ovld __cnfn mad_sat(ushort4 a, ushort4 b, ushort4 c);
short8 __ovld __cnfn mad_sat(short8 a, short8 b, short8 c);
ushort8 __ovld __cnfn mad_sat(ushort8 a, ushort8 b, ushort8 c);
short16 __ovld __cnfn mad_sat(short16 a, short16 b, short16 c);
ushort16 __ovld __cnfn mad_sat(ushort16 a, ushort16 b, ushort16 c);
int __ovld __cnfn mad_sat(int a, int b, int c);
uint __ovld __cnfn mad_sat(uint a, uint b, uint c);
int2 __ovld __cnfn mad_sat(int2 a, int2 b, int2 c);
uint2 __ovld __cnfn mad_sat(uint2 a, uint2 b, uint2 c);
int3 __ovld __cnfn mad_sat(int3 a, int3 b, int3 c);
uint3 __ovld __cnfn mad_sat(uint3 a, uint3 b, uint3 c);
int4 __ovld __cnfn mad_sat(int4 a, int4 b, int4 c);
uint4 __ovld __cnfn mad_sat(uint4 a, uint4 b, uint4 c);
int8 __ovld __cnfn mad_sat(int8 a, int8 b, int8 c);
uint8 __ovld __cnfn mad_sat(uint8 a, uint8 b, uint8 c);
int16 __ovld __cnfn mad_sat(int16 a, int16 b, int16 c);
uint16 __ovld __cnfn mad_sat(uint16 a, uint16 b, uint16 c);
long __ovld __cnfn mad_sat(long a, long b, long c);
ulong __ovld __cnfn mad_sat(ulong a, ulong b, ulong c);
long2 __ovld __cnfn mad_sat(long2 a, long2 b, long2 c);
ulong2 __ovld __cnfn mad_sat(ulong2 a, ulong2 b, ulong2 c);
long3 __ovld __cnfn mad_sat(long3 a, long3 b, long3 c);
ulong3 __ovld __cnfn mad_sat(ulong3 a, ulong3 b, ulong3 c);
long4 __ovld __cnfn mad_sat(long4 a, long4 b, long4 c);
ulong4 __ovld __cnfn mad_sat(ulong4 a, ulong4 b, ulong4 c);
long8 __ovld __cnfn mad_sat(long8 a, long8 b, long8 c);
ulong8 __ovld __cnfn mad_sat(ulong8 a, ulong8 b, ulong8 c);
long16 __ovld __cnfn mad_sat(long16 a, long16 b, long16 c);
ulong16 __ovld __cnfn mad_sat(ulong16 a, ulong16 b, ulong16 c);
/**
* Returns y if x < y, otherwise it returns x.
*/
char __ovld __cnfn max(char x, char y);
uchar __ovld __cnfn max(uchar x, uchar y);
char2 __ovld __cnfn max(char2 x, char2 y);
uchar2 __ovld __cnfn max(uchar2 x, uchar2 y);
char3 __ovld __cnfn max(char3 x, char3 y);
uchar3 __ovld __cnfn max(uchar3 x, uchar3 y);
char4 __ovld __cnfn max(char4 x, char4 y);
uchar4 __ovld __cnfn max(uchar4 x, uchar4 y);
char8 __ovld __cnfn max(char8 x, char8 y);
uchar8 __ovld __cnfn max(uchar8 x, uchar8 y);
char16 __ovld __cnfn max(char16 x, char16 y);
uchar16 __ovld __cnfn max(uchar16 x, uchar16 y);
short __ovld __cnfn max(short x, short y);
ushort __ovld __cnfn max(ushort x, ushort y);
short2 __ovld __cnfn max(short2 x, short2 y);
ushort2 __ovld __cnfn max(ushort2 x, ushort2 y);
short3 __ovld __cnfn max(short3 x, short3 y);
ushort3 __ovld __cnfn max(ushort3 x, ushort3 y);
short4 __ovld __cnfn max(short4 x, short4 y);
ushort4 __ovld __cnfn max(ushort4 x, ushort4 y);
short8 __ovld __cnfn max(short8 x, short8 y);
ushort8 __ovld __cnfn max(ushort8 x, ushort8 y);
short16 __ovld __cnfn max(short16 x, short16 y);
ushort16 __ovld __cnfn max(ushort16 x, ushort16 y);
int __ovld __cnfn max(int x, int y);
uint __ovld __cnfn max(uint x, uint y);
int2 __ovld __cnfn max(int2 x, int2 y);
uint2 __ovld __cnfn max(uint2 x, uint2 y);
int3 __ovld __cnfn max(int3 x, int3 y);
uint3 __ovld __cnfn max(uint3 x, uint3 y);
int4 __ovld __cnfn max(int4 x, int4 y);
uint4 __ovld __cnfn max(uint4 x, uint4 y);
int8 __ovld __cnfn max(int8 x, int8 y);
uint8 __ovld __cnfn max(uint8 x, uint8 y);
int16 __ovld __cnfn max(int16 x, int16 y);
uint16 __ovld __cnfn max(uint16 x, uint16 y);
long __ovld __cnfn max(long x, long y);
ulong __ovld __cnfn max(ulong x, ulong y);
long2 __ovld __cnfn max(long2 x, long2 y);
ulong2 __ovld __cnfn max(ulong2 x, ulong2 y);
long3 __ovld __cnfn max(long3 x, long3 y);
ulong3 __ovld __cnfn max(ulong3 x, ulong3 y);
long4 __ovld __cnfn max(long4 x, long4 y);
ulong4 __ovld __cnfn max(ulong4 x, ulong4 y);
long8 __ovld __cnfn max(long8 x, long8 y);
ulong8 __ovld __cnfn max(ulong8 x, ulong8 y);
long16 __ovld __cnfn max(long16 x, long16 y);
ulong16 __ovld __cnfn max(ulong16 x, ulong16 y);
char2 __ovld __cnfn max(char2 x, char y);
uchar2 __ovld __cnfn max(uchar2 x, uchar y);
char3 __ovld __cnfn max(char3 x, char y);
uchar3 __ovld __cnfn max(uchar3 x, uchar y);
char4 __ovld __cnfn max(char4 x, char y);
uchar4 __ovld __cnfn max(uchar4 x, uchar y);
char8 __ovld __cnfn max(char8 x, char y);
uchar8 __ovld __cnfn max(uchar8 x, uchar y);
char16 __ovld __cnfn max(char16 x, char y);
uchar16 __ovld __cnfn max(uchar16 x, uchar y);
short2 __ovld __cnfn max(short2 x, short y);
ushort2 __ovld __cnfn max(ushort2 x, ushort y);
short3 __ovld __cnfn max(short3 x, short y);
ushort3 __ovld __cnfn max(ushort3 x, ushort y);
short4 __ovld __cnfn max(short4 x, short y);
ushort4 __ovld __cnfn max(ushort4 x, ushort y);
short8 __ovld __cnfn max(short8 x, short y);
ushort8 __ovld __cnfn max(ushort8 x, ushort y);
short16 __ovld __cnfn max(short16 x, short y);
ushort16 __ovld __cnfn max(ushort16 x, ushort y);
int2 __ovld __cnfn max(int2 x, int y);
uint2 __ovld __cnfn max(uint2 x, uint y);
int3 __ovld __cnfn max(int3 x, int y);
uint3 __ovld __cnfn max(uint3 x, uint y);
int4 __ovld __cnfn max(int4 x, int y);
uint4 __ovld __cnfn max(uint4 x, uint y);
int8 __ovld __cnfn max(int8 x, int y);
uint8 __ovld __cnfn max(uint8 x, uint y);
int16 __ovld __cnfn max(int16 x, int y);
uint16 __ovld __cnfn max(uint16 x, uint y);
long2 __ovld __cnfn max(long2 x, long y);
ulong2 __ovld __cnfn max(ulong2 x, ulong y);
long3 __ovld __cnfn max(long3 x, long y);
ulong3 __ovld __cnfn max(ulong3 x, ulong y);
long4 __ovld __cnfn max(long4 x, long y);
ulong4 __ovld __cnfn max(ulong4 x, ulong y);
long8 __ovld __cnfn max(long8 x, long y);
ulong8 __ovld __cnfn max(ulong8 x, ulong y);
long16 __ovld __cnfn max(long16 x, long y);
ulong16 __ovld __cnfn max(ulong16 x, ulong y);
/**
* Returns y if y < x, otherwise it returns x.
*/
char __ovld __cnfn min(char x, char y);
uchar __ovld __cnfn min(uchar x, uchar y);
char2 __ovld __cnfn min(char2 x, char2 y);
uchar2 __ovld __cnfn min(uchar2 x, uchar2 y);
char3 __ovld __cnfn min(char3 x, char3 y);
uchar3 __ovld __cnfn min(uchar3 x, uchar3 y);
char4 __ovld __cnfn min(char4 x, char4 y);
uchar4 __ovld __cnfn min(uchar4 x, uchar4 y);
char8 __ovld __cnfn min(char8 x, char8 y);
uchar8 __ovld __cnfn min(uchar8 x, uchar8 y);
char16 __ovld __cnfn min(char16 x, char16 y);
uchar16 __ovld __cnfn min(uchar16 x, uchar16 y);
short __ovld __cnfn min(short x, short y);
ushort __ovld __cnfn min(ushort x, ushort y);
short2 __ovld __cnfn min(short2 x, short2 y);
ushort2 __ovld __cnfn min(ushort2 x, ushort2 y);
short3 __ovld __cnfn min(short3 x, short3 y);
ushort3 __ovld __cnfn min(ushort3 x, ushort3 y);
short4 __ovld __cnfn min(short4 x, short4 y);
ushort4 __ovld __cnfn min(ushort4 x, ushort4 y);
short8 __ovld __cnfn min(short8 x, short8 y);
ushort8 __ovld __cnfn min(ushort8 x, ushort8 y);
short16 __ovld __cnfn min(short16 x, short16 y);
ushort16 __ovld __cnfn min(ushort16 x, ushort16 y);
int __ovld __cnfn min(int x, int y);
uint __ovld __cnfn min(uint x, uint y);
int2 __ovld __cnfn min(int2 x, int2 y);
uint2 __ovld __cnfn min(uint2 x, uint2 y);
int3 __ovld __cnfn min(int3 x, int3 y);
uint3 __ovld __cnfn min(uint3 x, uint3 y);
int4 __ovld __cnfn min(int4 x, int4 y);
uint4 __ovld __cnfn min(uint4 x, uint4 y);
int8 __ovld __cnfn min(int8 x, int8 y);
uint8 __ovld __cnfn min(uint8 x, uint8 y);
int16 __ovld __cnfn min(int16 x, int16 y);
uint16 __ovld __cnfn min(uint16 x, uint16 y);
long __ovld __cnfn min(long x, long y);
ulong __ovld __cnfn min(ulong x, ulong y);
long2 __ovld __cnfn min(long2 x, long2 y);
ulong2 __ovld __cnfn min(ulong2 x, ulong2 y);
long3 __ovld __cnfn min(long3 x, long3 y);
ulong3 __ovld __cnfn min(ulong3 x, ulong3 y);
long4 __ovld __cnfn min(long4 x, long4 y);
ulong4 __ovld __cnfn min(ulong4 x, ulong4 y);
long8 __ovld __cnfn min(long8 x, long8 y);
ulong8 __ovld __cnfn min(ulong8 x, ulong8 y);
long16 __ovld __cnfn min(long16 x, long16 y);
ulong16 __ovld __cnfn min(ulong16 x, ulong16 y);
char2 __ovld __cnfn min(char2 x, char y);
uchar2 __ovld __cnfn min(uchar2 x, uchar y);
char3 __ovld __cnfn min(char3 x, char y);
uchar3 __ovld __cnfn min(uchar3 x, uchar y);
char4 __ovld __cnfn min(char4 x, char y);
uchar4 __ovld __cnfn min(uchar4 x, uchar y);
char8 __ovld __cnfn min(char8 x, char y);
uchar8 __ovld __cnfn min(uchar8 x, uchar y);
char16 __ovld __cnfn min(char16 x, char y);
uchar16 __ovld __cnfn min(uchar16 x, uchar y);
short2 __ovld __cnfn min(short2 x, short y);
ushort2 __ovld __cnfn min(ushort2 x, ushort y);
short3 __ovld __cnfn min(short3 x, short y);
ushort3 __ovld __cnfn min(ushort3 x, ushort y);
short4 __ovld __cnfn min(short4 x, short y);
ushort4 __ovld __cnfn min(ushort4 x, ushort y);
short8 __ovld __cnfn min(short8 x, short y);
ushort8 __ovld __cnfn min(ushort8 x, ushort y);
short16 __ovld __cnfn min(short16 x, short y);
ushort16 __ovld __cnfn min(ushort16 x, ushort y);
int2 __ovld __cnfn min(int2 x, int y);
uint2 __ovld __cnfn min(uint2 x, uint y);
int3 __ovld __cnfn min(int3 x, int y);
uint3 __ovld __cnfn min(uint3 x, uint y);
int4 __ovld __cnfn min(int4 x, int y);
uint4 __ovld __cnfn min(uint4 x, uint y);
int8 __ovld __cnfn min(int8 x, int y);
uint8 __ovld __cnfn min(uint8 x, uint y);
int16 __ovld __cnfn min(int16 x, int y);
uint16 __ovld __cnfn min(uint16 x, uint y);
long2 __ovld __cnfn min(long2 x, long y);
ulong2 __ovld __cnfn min(ulong2 x, ulong y);
long3 __ovld __cnfn min(long3 x, long y);
ulong3 __ovld __cnfn min(ulong3 x, ulong y);
long4 __ovld __cnfn min(long4 x, long y);
ulong4 __ovld __cnfn min(ulong4 x, ulong y);
long8 __ovld __cnfn min(long8 x, long y);
ulong8 __ovld __cnfn min(ulong8 x, ulong y);
long16 __ovld __cnfn min(long16 x, long y);
ulong16 __ovld __cnfn min(ulong16 x, ulong y);
/**
* Computes x * y and returns the high half of the
* product of x and y.
*/
char __ovld __cnfn mul_hi(char x, char y);
uchar __ovld __cnfn mul_hi(uchar x, uchar y);
char2 __ovld __cnfn mul_hi(char2 x, char2 y);
uchar2 __ovld __cnfn mul_hi(uchar2 x, uchar2 y);
char3 __ovld __cnfn mul_hi(char3 x, char3 y);
uchar3 __ovld __cnfn mul_hi(uchar3 x, uchar3 y);
char4 __ovld __cnfn mul_hi(char4 x, char4 y);
uchar4 __ovld __cnfn mul_hi(uchar4 x, uchar4 y);
char8 __ovld __cnfn mul_hi(char8 x, char8 y);
uchar8 __ovld __cnfn mul_hi(uchar8 x, uchar8 y);
char16 __ovld __cnfn mul_hi(char16 x, char16 y);
uchar16 __ovld __cnfn mul_hi(uchar16 x, uchar16 y);
short __ovld __cnfn mul_hi(short x, short y);
ushort __ovld __cnfn mul_hi(ushort x, ushort y);
short2 __ovld __cnfn mul_hi(short2 x, short2 y);
ushort2 __ovld __cnfn mul_hi(ushort2 x, ushort2 y);
short3 __ovld __cnfn mul_hi(short3 x, short3 y);
ushort3 __ovld __cnfn mul_hi(ushort3 x, ushort3 y);
short4 __ovld __cnfn mul_hi(short4 x, short4 y);
ushort4 __ovld __cnfn mul_hi(ushort4 x, ushort4 y);
short8 __ovld __cnfn mul_hi(short8 x, short8 y);
ushort8 __ovld __cnfn mul_hi(ushort8 x, ushort8 y);
short16 __ovld __cnfn mul_hi(short16 x, short16 y);
ushort16 __ovld __cnfn mul_hi(ushort16 x, ushort16 y);
int __ovld __cnfn mul_hi(int x, int y);
uint __ovld __cnfn mul_hi(uint x, uint y);
int2 __ovld __cnfn mul_hi(int2 x, int2 y);
uint2 __ovld __cnfn mul_hi(uint2 x, uint2 y);
int3 __ovld __cnfn mul_hi(int3 x, int3 y);
uint3 __ovld __cnfn mul_hi(uint3 x, uint3 y);
int4 __ovld __cnfn mul_hi(int4 x, int4 y);
uint4 __ovld __cnfn mul_hi(uint4 x, uint4 y);
int8 __ovld __cnfn mul_hi(int8 x, int8 y);
uint8 __ovld __cnfn mul_hi(uint8 x, uint8 y);
int16 __ovld __cnfn mul_hi(int16 x, int16 y);
uint16 __ovld __cnfn mul_hi(uint16 x, uint16 y);
long __ovld __cnfn mul_hi(long x, long y);
ulong __ovld __cnfn mul_hi(ulong x, ulong y);
long2 __ovld __cnfn mul_hi(long2 x, long2 y);
ulong2 __ovld __cnfn mul_hi(ulong2 x, ulong2 y);
long3 __ovld __cnfn mul_hi(long3 x, long3 y);
ulong3 __ovld __cnfn mul_hi(ulong3 x, ulong3 y);
long4 __ovld __cnfn mul_hi(long4 x, long4 y);
ulong4 __ovld __cnfn mul_hi(ulong4 x, ulong4 y);
long8 __ovld __cnfn mul_hi(long8 x, long8 y);
ulong8 __ovld __cnfn mul_hi(ulong8 x, ulong8 y);
long16 __ovld __cnfn mul_hi(long16 x, long16 y);
ulong16 __ovld __cnfn mul_hi(ulong16 x, ulong16 y);
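// Illustrative sketch (hypothetical kernel): mul_hi paired with the ordinary
// low product yields the full 64-bit result of a 32-bit multiply.
__kernel void full_mul(__global const uint *a, __global const uint *b,
                       __global ulong *prod) {
    size_t gid = get_global_id(0);
    uint lo = a[gid] * b[gid];          // low 32 bits (modulo 2^32)
    uint hi = mul_hi(a[gid], b[gid]);   // high 32 bits
    prod[gid] = ((ulong)hi << 32) | lo;
}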
/**
* For each element in v, the bits are shifted left by
* the number of bits given by the corresponding
* element in i (subject to usual shift modulo rules
* described in section 6.3). Bits shifted off the left
* side of the element are shifted back in from the
* right.
*/
char __ovld __cnfn rotate(char v, char i);
uchar __ovld __cnfn rotate(uchar v, uchar i);
char2 __ovld __cnfn rotate(char2 v, char2 i);
uchar2 __ovld __cnfn rotate(uchar2 v, uchar2 i);
char3 __ovld __cnfn rotate(char3 v, char3 i);
uchar3 __ovld __cnfn rotate(uchar3 v, uchar3 i);
char4 __ovld __cnfn rotate(char4 v, char4 i);
uchar4 __ovld __cnfn rotate(uchar4 v, uchar4 i);
char8 __ovld __cnfn rotate(char8 v, char8 i);
uchar8 __ovld __cnfn rotate(uchar8 v, uchar8 i);
char16 __ovld __cnfn rotate(char16 v, char16 i);
uchar16 __ovld __cnfn rotate(uchar16 v, uchar16 i);
short __ovld __cnfn rotate(short v, short i);
ushort __ovld __cnfn rotate(ushort v, ushort i);
short2 __ovld __cnfn rotate(short2 v, short2 i);
ushort2 __ovld __cnfn rotate(ushort2 v, ushort2 i);
short3 __ovld __cnfn rotate(short3 v, short3 i);
ushort3 __ovld __cnfn rotate(ushort3 v, ushort3 i);
short4 __ovld __cnfn rotate(short4 v, short4 i);
ushort4 __ovld __cnfn rotate(ushort4 v, ushort4 i);
short8 __ovld __cnfn rotate(short8 v, short8 i);
ushort8 __ovld __cnfn rotate(ushort8 v, ushort8 i);
short16 __ovld __cnfn rotate(short16 v, short16 i);
ushort16 __ovld __cnfn rotate(ushort16 v, ushort16 i);
int __ovld __cnfn rotate(int v, int i);
uint __ovld __cnfn rotate(uint v, uint i);
int2 __ovld __cnfn rotate(int2 v, int2 i);
uint2 __ovld __cnfn rotate(uint2 v, uint2 i);
int3 __ovld __cnfn rotate(int3 v, int3 i);
uint3 __ovld __cnfn rotate(uint3 v, uint3 i);
int4 __ovld __cnfn rotate(int4 v, int4 i);
uint4 __ovld __cnfn rotate(uint4 v, uint4 i);
int8 __ovld __cnfn rotate(int8 v, int8 i);
uint8 __ovld __cnfn rotate(uint8 v, uint8 i);
int16 __ovld __cnfn rotate(int16 v, int16 i);
uint16 __ovld __cnfn rotate(uint16 v, uint16 i);
long __ovld __cnfn rotate(long v, long i);
ulong __ovld __cnfn rotate(ulong v, ulong i);
long2 __ovld __cnfn rotate(long2 v, long2 i);
ulong2 __ovld __cnfn rotate(ulong2 v, ulong2 i);
long3 __ovld __cnfn rotate(long3 v, long3 i);
ulong3 __ovld __cnfn rotate(ulong3 v, ulong3 i);
long4 __ovld __cnfn rotate(long4 v, long4 i);
ulong4 __ovld __cnfn rotate(ulong4 v, ulong4 i);
long8 __ovld __cnfn rotate(long8 v, long8 i);
ulong8 __ovld __cnfn rotate(ulong8 v, ulong8 i);
long16 __ovld __cnfn rotate(long16 v, long16 i);
ulong16 __ovld __cnfn rotate(ulong16 v, ulong16 i);
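// Illustrative sketch (hypothetical kernel): rotate is a circular shift, so
// bits leaving on the left re-enter on the right, with the shift count taken
// modulo the bit width, e.g. rotate(0x80000001u, 1u) == 0x00000003u.
__kernel void rotl_demo(__global uint *v, uint amount) {
    size_t gid = get_global_id(0);
    v[gid] = rotate(v[gid], amount);
}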
/**
* Returns x - y and saturates the result.
*/
char __ovld __cnfn sub_sat(char x, char y);
uchar __ovld __cnfn sub_sat(uchar x, uchar y);
char2 __ovld __cnfn sub_sat(char2 x, char2 y);
uchar2 __ovld __cnfn sub_sat(uchar2 x, uchar2 y);
char3 __ovld __cnfn sub_sat(char3 x, char3 y);
uchar3 __ovld __cnfn sub_sat(uchar3 x, uchar3 y);
char4 __ovld __cnfn sub_sat(char4 x, char4 y);
uchar4 __ovld __cnfn sub_sat(uchar4 x, uchar4 y);
char8 __ovld __cnfn sub_sat(char8 x, char8 y);
uchar8 __ovld __cnfn sub_sat(uchar8 x, uchar8 y);
char16 __ovld __cnfn sub_sat(char16 x, char16 y);
uchar16 __ovld __cnfn sub_sat(uchar16 x, uchar16 y);
short __ovld __cnfn sub_sat(short x, short y);
ushort __ovld __cnfn sub_sat(ushort x, ushort y);
short2 __ovld __cnfn sub_sat(short2 x, short2 y);
ushort2 __ovld __cnfn sub_sat(ushort2 x, ushort2 y);
short3 __ovld __cnfn sub_sat(short3 x, short3 y);
ushort3 __ovld __cnfn sub_sat(ushort3 x, ushort3 y);
short4 __ovld __cnfn sub_sat(short4 x, short4 y);
ushort4 __ovld __cnfn sub_sat(ushort4 x, ushort4 y);
short8 __ovld __cnfn sub_sat(short8 x, short8 y);
ushort8 __ovld __cnfn sub_sat(ushort8 x, ushort8 y);
short16 __ovld __cnfn sub_sat(short16 x, short16 y);
ushort16 __ovld __cnfn sub_sat(ushort16 x, ushort16 y);
int __ovld __cnfn sub_sat(int x, int y);
uint __ovld __cnfn sub_sat(uint x, uint y);
int2 __ovld __cnfn sub_sat(int2 x, int2 y);
uint2 __ovld __cnfn sub_sat(uint2 x, uint2 y);
int3 __ovld __cnfn sub_sat(int3 x, int3 y);
uint3 __ovld __cnfn sub_sat(uint3 x, uint3 y);
int4 __ovld __cnfn sub_sat(int4 x, int4 y);
uint4 __ovld __cnfn sub_sat(uint4 x, uint4 y);
int8 __ovld __cnfn sub_sat(int8 x, int8 y);
uint8 __ovld __cnfn sub_sat(uint8 x, uint8 y);
int16 __ovld __cnfn sub_sat(int16 x, int16 y);
uint16 __ovld __cnfn sub_sat(uint16 x, uint16 y);
long __ovld __cnfn sub_sat(long x, long y);
ulong __ovld __cnfn sub_sat(ulong x, ulong y);
long2 __ovld __cnfn sub_sat(long2 x, long2 y);
ulong2 __ovld __cnfn sub_sat(ulong2 x, ulong2 y);
long3 __ovld __cnfn sub_sat(long3 x, long3 y);
ulong3 __ovld __cnfn sub_sat(ulong3 x, ulong3 y);
long4 __ovld __cnfn sub_sat(long4 x, long4 y);
ulong4 __ovld __cnfn sub_sat(ulong4 x, ulong4 y);
long8 __ovld __cnfn sub_sat(long8 x, long8 y);
ulong8 __ovld __cnfn sub_sat(ulong8 x, ulong8 y);
long16 __ovld __cnfn sub_sat(long16 x, long16 y);
ulong16 __ovld __cnfn sub_sat(ulong16 x, ulong16 y);
/**
* result[i] = ((short)hi[i] << 8) | lo[i]
* result[i] = ((ushort)hi[i] << 8) | lo[i]
*/
short __ovld __cnfn upsample(char hi, uchar lo);
ushort __ovld __cnfn upsample(uchar hi, uchar lo);
short2 __ovld __cnfn upsample(char2 hi, uchar2 lo);
short3 __ovld __cnfn upsample(char3 hi, uchar3 lo);
short4 __ovld __cnfn upsample(char4 hi, uchar4 lo);
short8 __ovld __cnfn upsample(char8 hi, uchar8 lo);
short16 __ovld __cnfn upsample(char16 hi, uchar16 lo);
ushort2 __ovld __cnfn upsample(uchar2 hi, uchar2 lo);
ushort3 __ovld __cnfn upsample(uchar3 hi, uchar3 lo);
ushort4 __ovld __cnfn upsample(uchar4 hi, uchar4 lo);
ushort8 __ovld __cnfn upsample(uchar8 hi, uchar8 lo);
ushort16 __ovld __cnfn upsample(uchar16 hi, uchar16 lo);
/**
* result[i] = ((int)hi[i] << 16) | lo[i]
* result[i] = ((uint)hi[i] << 16) | lo[i]
*/
int __ovld __cnfn upsample(short hi, ushort lo);
uint __ovld __cnfn upsample(ushort hi, ushort lo);
int2 __ovld __cnfn upsample(short2 hi, ushort2 lo);
int3 __ovld __cnfn upsample(short3 hi, ushort3 lo);
int4 __ovld __cnfn upsample(short4 hi, ushort4 lo);
int8 __ovld __cnfn upsample(short8 hi, ushort8 lo);
int16 __ovld __cnfn upsample(short16 hi, ushort16 lo);
uint2 __ovld __cnfn upsample(ushort2 hi, ushort2 lo);
uint3 __ovld __cnfn upsample(ushort3 hi, ushort3 lo);
uint4 __ovld __cnfn upsample(ushort4 hi, ushort4 lo);
uint8 __ovld __cnfn upsample(ushort8 hi, ushort8 lo);
uint16 __ovld __cnfn upsample(ushort16 hi, ushort16 lo);
/**
* result[i] = ((long)hi[i] << 32) | lo[i]
* result[i] = ((ulong)hi[i] << 32) | lo[i]
*/
long __ovld __cnfn upsample(int hi, uint lo);
ulong __ovld __cnfn upsample(uint hi, uint lo);
long2 __ovld __cnfn upsample(int2 hi, uint2 lo);
long3 __ovld __cnfn upsample(int3 hi, uint3 lo);
long4 __ovld __cnfn upsample(int4 hi, uint4 lo);
long8 __ovld __cnfn upsample(int8 hi, uint8 lo);
long16 __ovld __cnfn upsample(int16 hi, uint16 lo);
ulong2 __ovld __cnfn upsample(uint2 hi, uint2 lo);
ulong3 __ovld __cnfn upsample(uint3 hi, uint3 lo);
ulong4 __ovld __cnfn upsample(uint4 hi, uint4 lo);
ulong8 __ovld __cnfn upsample(uint8 hi, uint8 lo);
ulong16 __ovld __cnfn upsample(uint16 hi, uint16 lo);
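// Illustrative sketch (hypothetical kernel): upsample rebuilds a wider
// integer from its high and low halves, e.g. recombining a 16-bit sample
// split across two byte planes: ((ushort)hi << 8) | lo.
__kernel void combine_bytes(__global const uchar *hi, __global const uchar *lo,
                            __global ushort *out) {
    size_t gid = get_global_id(0);
    out[gid] = upsample(hi[gid], lo[gid]);
}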
/*
* popcount(x): returns the number of set bits in x
*/
char __ovld __cnfn popcount(char x);
uchar __ovld __cnfn popcount(uchar x);
char2 __ovld __cnfn popcount(char2 x);
uchar2 __ovld __cnfn popcount(uchar2 x);
char3 __ovld __cnfn popcount(char3 x);
uchar3 __ovld __cnfn popcount(uchar3 x);
char4 __ovld __cnfn popcount(char4 x);
uchar4 __ovld __cnfn popcount(uchar4 x);
char8 __ovld __cnfn popcount(char8 x);
uchar8 __ovld __cnfn popcount(uchar8 x);
char16 __ovld __cnfn popcount(char16 x);
uchar16 __ovld __cnfn popcount(uchar16 x);
short __ovld __cnfn popcount(short x);
ushort __ovld __cnfn popcount(ushort x);
short2 __ovld __cnfn popcount(short2 x);
ushort2 __ovld __cnfn popcount(ushort2 x);
short3 __ovld __cnfn popcount(short3 x);
ushort3 __ovld __cnfn popcount(ushort3 x);
short4 __ovld __cnfn popcount(short4 x);
ushort4 __ovld __cnfn popcount(ushort4 x);
short8 __ovld __cnfn popcount(short8 x);
ushort8 __ovld __cnfn popcount(ushort8 x);
short16 __ovld __cnfn popcount(short16 x);
ushort16 __ovld __cnfn popcount(ushort16 x);
int __ovld __cnfn popcount(int x);
uint __ovld __cnfn popcount(uint x);
int2 __ovld __cnfn popcount(int2 x);
uint2 __ovld __cnfn popcount(uint2 x);
int3 __ovld __cnfn popcount(int3 x);
uint3 __ovld __cnfn popcount(uint3 x);
int4 __ovld __cnfn popcount(int4 x);
uint4 __ovld __cnfn popcount(uint4 x);
int8 __ovld __cnfn popcount(int8 x);
uint8 __ovld __cnfn popcount(uint8 x);
int16 __ovld __cnfn popcount(int16 x);
uint16 __ovld __cnfn popcount(uint16 x);
long __ovld __cnfn popcount(long x);
ulong __ovld __cnfn popcount(ulong x);
long2 __ovld __cnfn popcount(long2 x);
ulong2 __ovld __cnfn popcount(ulong2 x);
long3 __ovld __cnfn popcount(long3 x);
ulong3 __ovld __cnfn popcount(ulong3 x);
long4 __ovld __cnfn popcount(long4 x);
ulong4 __ovld __cnfn popcount(ulong4 x);
long8 __ovld __cnfn popcount(long8 x);
ulong8 __ovld __cnfn popcount(ulong8 x);
long16 __ovld __cnfn popcount(long16 x);
ulong16 __ovld __cnfn popcount(ulong16 x);
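// Illustrative sketch (hypothetical kernel): popcount of the XOR of two words
// is their Hamming distance, a common building block for binary descriptors.
__kernel void hamming(__global const uint *a, __global const uint *b,
                      __global uint *dist) {
    size_t gid = get_global_id(0);
    dist[gid] = popcount(a[gid] ^ b[gid]);
}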
/**
* Multiply two 24-bit integer values x and y and add
* the 32-bit integer result to the 32-bit integer z.
* Refer to definition of mul24 to see how the 24-bit
* integer multiplication is performed.
*/
int __ovld __cnfn mad24(int x, int y, int z);
uint __ovld __cnfn mad24(uint x, uint y, uint z);
int2 __ovld __cnfn mad24(int2 x, int2 y, int2 z);
uint2 __ovld __cnfn mad24(uint2 x, uint2 y, uint2 z);
int3 __ovld __cnfn mad24(int3 x, int3 y, int3 z);
uint3 __ovld __cnfn mad24(uint3 x, uint3 y, uint3 z);
int4 __ovld __cnfn mad24(int4 x, int4 y, int4 z);
uint4 __ovld __cnfn mad24(uint4 x, uint4 y, uint4 z);
int8 __ovld __cnfn mad24(int8 x, int8 y, int8 z);
uint8 __ovld __cnfn mad24(uint8 x, uint8 y, uint8 z);
int16 __ovld __cnfn mad24(int16 x, int16 y, int16 z);
uint16 __ovld __cnfn mad24(uint16 x, uint16 y, uint16 z);
/**
* Multiply two 24-bit integer values x and y. x and y
* are 32-bit integers but only the low 24-bits are used
* to perform the multiplication. mul24 should only
* be used when values in x and y are in the range
* [-2^23, 2^23-1] if x and y are signed integers and in the
* range [0, 2^24-1] if x and y are unsigned integers. If
* x and y are not in this range, the multiplication
* result is implementation-defined.
*/
int __ovld __cnfn mul24(int x, int y);
uint __ovld __cnfn mul24(uint x, uint y);
int2 __ovld __cnfn mul24(int2 x, int2 y);
uint2 __ovld __cnfn mul24(uint2 x, uint2 y);
int3 __ovld __cnfn mul24(int3 x, int3 y);
uint3 __ovld __cnfn mul24(uint3 x, uint3 y);
int4 __ovld __cnfn mul24(int4 x, int4 y);
uint4 __ovld __cnfn mul24(uint4 x, uint4 y);
int8 __ovld __cnfn mul24(int8 x, int8 y);
uint8 __ovld __cnfn mul24(uint8 x, uint8 y);
int16 __ovld __cnfn mul24(int16 x, int16 y);
uint16 __ovld __cnfn mul24(uint16 x, uint16 y);
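// Illustrative sketch (hypothetical kernel): when row, stride and col are
// known to fit in 24 bits, mad24 can compute row * stride + col more cheaply
// than a full 32-bit multiply-add on some devices.
__kernel void copy_2d(__global const float *src, __global float *dst, int stride) {
    int row = (int)get_global_id(1);
    int col = (int)get_global_id(0);
    int idx = mad24(row, stride, col);  // valid while the operands stay below 2^23
    dst[idx] = src[idx];
}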
// OpenCL v1.1 s6.11.4, v1.2 s6.12.4, v2.0 s6.13.4 - Common Functions
/**
* Returns fmin(fmax(x, minval), maxval).
* Results are undefined if minval > maxval.
*/
float __ovld __cnfn clamp(float x, float minval, float maxval);
float2 __ovld __cnfn clamp(float2 x, float2 minval, float2 maxval);
float3 __ovld __cnfn clamp(float3 x, float3 minval, float3 maxval);
float4 __ovld __cnfn clamp(float4 x, float4 minval, float4 maxval);
float8 __ovld __cnfn clamp(float8 x, float8 minval, float8 maxval);
float16 __ovld __cnfn clamp(float16 x, float16 minval, float16 maxval);
float2 __ovld __cnfn clamp(float2 x, float minval, float maxval);
float3 __ovld __cnfn clamp(float3 x, float minval, float maxval);
float4 __ovld __cnfn clamp(float4 x, float minval, float maxval);
float8 __ovld __cnfn clamp(float8 x, float minval, float maxval);
float16 __ovld __cnfn clamp(float16 x, float minval, float maxval);
#ifdef cl_khr_fp64
double __ovld __cnfn clamp(double x, double minval, double maxval);
double2 __ovld __cnfn clamp(double2 x, double2 minval, double2 maxval);
double3 __ovld __cnfn clamp(double3 x, double3 minval, double3 maxval);
double4 __ovld __cnfn clamp(double4 x, double4 minval, double4 maxval);
double8 __ovld __cnfn clamp(double8 x, double8 minval, double8 maxval);
double16 __ovld __cnfn clamp(double16 x, double16 minval, double16 maxval);
double2 __ovld __cnfn clamp(double2 x, double minval, double maxval);
double3 __ovld __cnfn clamp(double3 x, double minval, double maxval);
double4 __ovld __cnfn clamp(double4 x, double minval, double maxval);
double8 __ovld __cnfn clamp(double8 x, double minval, double maxval);
double16 __ovld __cnfn clamp(double16 x, double minval, double maxval);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn clamp(half x, half minval, half maxval);
half2 __ovld __cnfn clamp(half2 x, half2 minval, half2 maxval);
half3 __ovld __cnfn clamp(half3 x, half3 minval, half3 maxval);
half4 __ovld __cnfn clamp(half4 x, half4 minval, half4 maxval);
half8 __ovld __cnfn clamp(half8 x, half8 minval, half8 maxval);
half16 __ovld __cnfn clamp(half16 x, half16 minval, half16 maxval);
half2 __ovld __cnfn clamp(half2 x, half minval, half maxval);
half3 __ovld __cnfn clamp(half3 x, half minval, half maxval);
half4 __ovld __cnfn clamp(half4 x, half minval, half maxval);
half8 __ovld __cnfn clamp(half8 x, half minval, half maxval);
half16 __ovld __cnfn clamp(half16 x, half minval, half maxval);
#endif //cl_khr_fp16
/**
* Converts radians to degrees, i.e. (180 / PI) *
* radians.
*/
float __ovld __cnfn degrees(float radians);
float2 __ovld __cnfn degrees(float2 radians);
float3 __ovld __cnfn degrees(float3 radians);
float4 __ovld __cnfn degrees(float4 radians);
float8 __ovld __cnfn degrees(float8 radians);
float16 __ovld __cnfn degrees(float16 radians);
#ifdef cl_khr_fp64
double __ovld __cnfn degrees(double radians);
double2 __ovld __cnfn degrees(double2 radians);
double3 __ovld __cnfn degrees(double3 radians);
double4 __ovld __cnfn degrees(double4 radians);
double8 __ovld __cnfn degrees(double8 radians);
double16 __ovld __cnfn degrees(double16 radians);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn degrees(half radians);
half2 __ovld __cnfn degrees(half2 radians);
half3 __ovld __cnfn degrees(half3 radians);
half4 __ovld __cnfn degrees(half4 radians);
half8 __ovld __cnfn degrees(half8 radians);
half16 __ovld __cnfn degrees(half16 radians);
#endif //cl_khr_fp16
/**
* Returns y if x < y, otherwise it returns x. If x and y
* are infinite or NaN, the return values are undefined.
*/
float __ovld __cnfn max(float x, float y);
float2 __ovld __cnfn max(float2 x, float2 y);
float3 __ovld __cnfn max(float3 x, float3 y);
float4 __ovld __cnfn max(float4 x, float4 y);
float8 __ovld __cnfn max(float8 x, float8 y);
float16 __ovld __cnfn max(float16 x, float16 y);
float2 __ovld __cnfn max(float2 x, float y);
float3 __ovld __cnfn max(float3 x, float y);
float4 __ovld __cnfn max(float4 x, float y);
float8 __ovld __cnfn max(float8 x, float y);
float16 __ovld __cnfn max(float16 x, float y);
#ifdef cl_khr_fp64
double __ovld __cnfn max(double x, double y);
double2 __ovld __cnfn max(double2 x, double2 y);
double3 __ovld __cnfn max(double3 x, double3 y);
double4 __ovld __cnfn max(double4 x, double4 y);
double8 __ovld __cnfn max(double8 x, double8 y);
double16 __ovld __cnfn max(double16 x, double16 y);
double2 __ovld __cnfn max(double2 x, double y);
double3 __ovld __cnfn max(double3 x, double y);
double4 __ovld __cnfn max(double4 x, double y);
double8 __ovld __cnfn max(double8 x, double y);
double16 __ovld __cnfn max(double16 x, double y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn max(half x, half y);
half2 __ovld __cnfn max(half2 x, half2 y);
half3 __ovld __cnfn max(half3 x, half3 y);
half4 __ovld __cnfn max(half4 x, half4 y);
half8 __ovld __cnfn max(half8 x, half8 y);
half16 __ovld __cnfn max(half16 x, half16 y);
half2 __ovld __cnfn max(half2 x, half y);
half3 __ovld __cnfn max(half3 x, half y);
half4 __ovld __cnfn max(half4 x, half y);
half8 __ovld __cnfn max(half8 x, half y);
half16 __ovld __cnfn max(half16 x, half y);
#endif //cl_khr_fp16
/**
* Returns y if y < x, otherwise it returns x. If x and y
* are infinite or NaN, the return values are undefined.
*/
float __ovld __cnfn min(float x, float y);
float2 __ovld __cnfn min(float2 x, float2 y);
float3 __ovld __cnfn min(float3 x, float3 y);
float4 __ovld __cnfn min(float4 x, float4 y);
float8 __ovld __cnfn min(float8 x, float8 y);
float16 __ovld __cnfn min(float16 x, float16 y);
float2 __ovld __cnfn min(float2 x, float y);
float3 __ovld __cnfn min(float3 x, float y);
float4 __ovld __cnfn min(float4 x, float y);
float8 __ovld __cnfn min(float8 x, float y);
float16 __ovld __cnfn min(float16 x, float y);
#ifdef cl_khr_fp64
double __ovld __cnfn min(double x, double y);
double2 __ovld __cnfn min(double2 x, double2 y);
double3 __ovld __cnfn min(double3 x, double3 y);
double4 __ovld __cnfn min(double4 x, double4 y);
double8 __ovld __cnfn min(double8 x, double8 y);
double16 __ovld __cnfn min(double16 x, double16 y);
double2 __ovld __cnfn min(double2 x, double y);
double3 __ovld __cnfn min(double3 x, double y);
double4 __ovld __cnfn min(double4 x, double y);
double8 __ovld __cnfn min(double8 x, double y);
double16 __ovld __cnfn min(double16 x, double y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn min(half x, half y);
half2 __ovld __cnfn min(half2 x, half2 y);
half3 __ovld __cnfn min(half3 x, half3 y);
half4 __ovld __cnfn min(half4 x, half4 y);
half8 __ovld __cnfn min(half8 x, half8 y);
half16 __ovld __cnfn min(half16 x, half16 y);
half2 __ovld __cnfn min(half2 x, half y);
half3 __ovld __cnfn min(half3 x, half y);
half4 __ovld __cnfn min(half4 x, half y);
half8 __ovld __cnfn min(half8 x, half y);
half16 __ovld __cnfn min(half16 x, half y);
#endif //cl_khr_fp16
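// A minimal usage sketch for the scalar-broadcast overloads of min()/max()
// above (the variable names are hypothetical):
//
//   float4 clamped = min(max(v, lo), hi); // clamps each component of v to [lo, hi]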
/**
 * Returns the linear blend of x and y, implemented as:
* x + (y - x) * a
* a must be a value in the range 0.0 ... 1.0. If a is not
* in the range 0.0 ... 1.0, the return values are
* undefined.
*/
float __ovld __cnfn mix(float x, float y, float a);
float2 __ovld __cnfn mix(float2 x, float2 y, float2 a);
float3 __ovld __cnfn mix(float3 x, float3 y, float3 a);
float4 __ovld __cnfn mix(float4 x, float4 y, float4 a);
float8 __ovld __cnfn mix(float8 x, float8 y, float8 a);
float16 __ovld __cnfn mix(float16 x, float16 y, float16 a);
float2 __ovld __cnfn mix(float2 x, float2 y, float a);
float3 __ovld __cnfn mix(float3 x, float3 y, float a);
float4 __ovld __cnfn mix(float4 x, float4 y, float a);
float8 __ovld __cnfn mix(float8 x, float8 y, float a);
float16 __ovld __cnfn mix(float16 x, float16 y, float a);
#ifdef cl_khr_fp64
double __ovld __cnfn mix(double x, double y, double a);
double2 __ovld __cnfn mix(double2 x, double2 y, double2 a);
double3 __ovld __cnfn mix(double3 x, double3 y, double3 a);
double4 __ovld __cnfn mix(double4 x, double4 y, double4 a);
double8 __ovld __cnfn mix(double8 x, double8 y, double8 a);
double16 __ovld __cnfn mix(double16 x, double16 y, double16 a);
double2 __ovld __cnfn mix(double2 x, double2 y, double a);
double3 __ovld __cnfn mix(double3 x, double3 y, double a);
double4 __ovld __cnfn mix(double4 x, double4 y, double a);
double8 __ovld __cnfn mix(double8 x, double8 y, double a);
double16 __ovld __cnfn mix(double16 x, double16 y, double a);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn mix(half x, half y, half a);
half2 __ovld __cnfn mix(half2 x, half2 y, half2 a);
half3 __ovld __cnfn mix(half3 x, half3 y, half3 a);
half4 __ovld __cnfn mix(half4 x, half4 y, half4 a);
half8 __ovld __cnfn mix(half8 x, half8 y, half8 a);
half16 __ovld __cnfn mix(half16 x, half16 y, half16 a);
half2 __ovld __cnfn mix(half2 x, half2 y, half a);
half3 __ovld __cnfn mix(half3 x, half3 y, half a);
half4 __ovld __cnfn mix(half4 x, half4 y, half a);
half8 __ovld __cnfn mix(half8 x, half8 y, half a);
half16 __ovld __cnfn mix(half16 x, half16 y, half a);
#endif //cl_khr_fp16
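// A minimal usage sketch for mix() (the kernel and buffer names are
// hypothetical):
//
//   __kernel void blend(__global const float4 *a, __global const float4 *b,
//                       __global float4 *out, float t) {
//     size_t i = get_global_id(0);
//     out[i] = mix(a[i], b[i], t); // t is assumed to lie in [0.0, 1.0]
//   }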
/**
* Converts degrees to radians, i.e. (PI / 180) *
* degrees.
*/
float __ovld __cnfn radians(float degrees);
float2 __ovld __cnfn radians(float2 degrees);
float3 __ovld __cnfn radians(float3 degrees);
float4 __ovld __cnfn radians(float4 degrees);
float8 __ovld __cnfn radians(float8 degrees);
float16 __ovld __cnfn radians(float16 degrees);
#ifdef cl_khr_fp64
double __ovld __cnfn radians(double degrees);
double2 __ovld __cnfn radians(double2 degrees);
double3 __ovld __cnfn radians(double3 degrees);
double4 __ovld __cnfn radians(double4 degrees);
double8 __ovld __cnfn radians(double8 degrees);
double16 __ovld __cnfn radians(double16 degrees);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn radians(half degrees);
half2 __ovld __cnfn radians(half2 degrees);
half3 __ovld __cnfn radians(half3 degrees);
half4 __ovld __cnfn radians(half4 degrees);
half8 __ovld __cnfn radians(half8 degrees);
half16 __ovld __cnfn radians(half16 degrees);
#endif //cl_khr_fp16
/**
* Returns 0.0 if x < edge, otherwise it returns 1.0.
*/
float __ovld __cnfn step(float edge, float x);
float2 __ovld __cnfn step(float2 edge, float2 x);
float3 __ovld __cnfn step(float3 edge, float3 x);
float4 __ovld __cnfn step(float4 edge, float4 x);
float8 __ovld __cnfn step(float8 edge, float8 x);
float16 __ovld __cnfn step(float16 edge, float16 x);
float2 __ovld __cnfn step(float edge, float2 x);
float3 __ovld __cnfn step(float edge, float3 x);
float4 __ovld __cnfn step(float edge, float4 x);
float8 __ovld __cnfn step(float edge, float8 x);
float16 __ovld __cnfn step(float edge, float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn step(double edge, double x);
double2 __ovld __cnfn step(double2 edge, double2 x);
double3 __ovld __cnfn step(double3 edge, double3 x);
double4 __ovld __cnfn step(double4 edge, double4 x);
double8 __ovld __cnfn step(double8 edge, double8 x);
double16 __ovld __cnfn step(double16 edge, double16 x);
double2 __ovld __cnfn step(double edge, double2 x);
double3 __ovld __cnfn step(double edge, double3 x);
double4 __ovld __cnfn step(double edge, double4 x);
double8 __ovld __cnfn step(double edge, double8 x);
double16 __ovld __cnfn step(double edge, double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn step(half edge, half x);
half2 __ovld __cnfn step(half2 edge, half2 x);
half3 __ovld __cnfn step(half3 edge, half3 x);
half4 __ovld __cnfn step(half4 edge, half4 x);
half8 __ovld __cnfn step(half8 edge, half8 x);
half16 __ovld __cnfn step(half16 edge, half16 x);
half2 __ovld __cnfn step(half edge, half2 x);
half3 __ovld __cnfn step(half edge, half3 x);
half4 __ovld __cnfn step(half edge, half4 x);
half8 __ovld __cnfn step(half edge, half8 x);
half16 __ovld __cnfn step(half edge, half16 x);
#endif //cl_khr_fp16
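// A minimal usage sketch for step() with a scalar edge (the variable names
// are hypothetical):
//
//   float4 mask = step(0.5f, v); // per component: 1.0 where v >= 0.5, 0.0 where v < 0.5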
/**
* Returns 0.0 if x <= edge0 and 1.0 if x >= edge1 and
* performs smooth Hermite interpolation between 0
 * and 1 when edge0 < x < edge1. This is useful in
* cases where you would want a threshold function
* with a smooth transition.
* This is equivalent to:
* gentype t;
* t = clamp ((x - edge0) / (edge1 - edge0), 0, 1);
* return t * t * (3 - 2 * t);
* Results are undefined if edge0 >= edge1 or if x,
* edge0 or edge1 is a NaN.
*/
float __ovld __cnfn smoothstep(float edge0, float edge1, float x);
float2 __ovld __cnfn smoothstep(float2 edge0, float2 edge1, float2 x);
float3 __ovld __cnfn smoothstep(float3 edge0, float3 edge1, float3 x);
float4 __ovld __cnfn smoothstep(float4 edge0, float4 edge1, float4 x);
float8 __ovld __cnfn smoothstep(float8 edge0, float8 edge1, float8 x);
float16 __ovld __cnfn smoothstep(float16 edge0, float16 edge1, float16 x);
float2 __ovld __cnfn smoothstep(float edge0, float edge1, float2 x);
float3 __ovld __cnfn smoothstep(float edge0, float edge1, float3 x);
float4 __ovld __cnfn smoothstep(float edge0, float edge1, float4 x);
float8 __ovld __cnfn smoothstep(float edge0, float edge1, float8 x);
float16 __ovld __cnfn smoothstep(float edge0, float edge1, float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn smoothstep(double edge0, double edge1, double x);
double2 __ovld __cnfn smoothstep(double2 edge0, double2 edge1, double2 x);
double3 __ovld __cnfn smoothstep(double3 edge0, double3 edge1, double3 x);
double4 __ovld __cnfn smoothstep(double4 edge0, double4 edge1, double4 x);
double8 __ovld __cnfn smoothstep(double8 edge0, double8 edge1, double8 x);
double16 __ovld __cnfn smoothstep(double16 edge0, double16 edge1, double16 x);
double2 __ovld __cnfn smoothstep(double edge0, double edge1, double2 x);
double3 __ovld __cnfn smoothstep(double edge0, double edge1, double3 x);
double4 __ovld __cnfn smoothstep(double edge0, double edge1, double4 x);
double8 __ovld __cnfn smoothstep(double edge0, double edge1, double8 x);
double16 __ovld __cnfn smoothstep(double edge0, double edge1, double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn smoothstep(half edge0, half edge1, half x);
half2 __ovld __cnfn smoothstep(half2 edge0, half2 edge1, half2 x);
half3 __ovld __cnfn smoothstep(half3 edge0, half3 edge1, half3 x);
half4 __ovld __cnfn smoothstep(half4 edge0, half4 edge1, half4 x);
half8 __ovld __cnfn smoothstep(half8 edge0, half8 edge1, half8 x);
half16 __ovld __cnfn smoothstep(half16 edge0, half16 edge1, half16 x);
half2 __ovld __cnfn smoothstep(half edge0, half edge1, half2 x);
half3 __ovld __cnfn smoothstep(half edge0, half edge1, half3 x);
half4 __ovld __cnfn smoothstep(half edge0, half edge1, half4 x);
half8 __ovld __cnfn smoothstep(half edge0, half edge1, half8 x);
half16 __ovld __cnfn smoothstep(half edge0, half edge1, half16 x);
#endif //cl_khr_fp16
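// A minimal usage sketch for smoothstep() as a soft threshold (the variable
// names are hypothetical):
//
//   float t = smoothstep(0.0f, 1.0f, x); // 0 for x <= 0, 1 for x >= 1,
//                                        // smooth Hermite ramp in between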
/**
* Returns 1.0 if x > 0, -0.0 if x = -0.0, +0.0 if x =
* +0.0, or -1.0 if x < 0. Returns 0.0 if x is a NaN.
*/
float __ovld __cnfn sign(float x);
float2 __ovld __cnfn sign(float2 x);
float3 __ovld __cnfn sign(float3 x);
float4 __ovld __cnfn sign(float4 x);
float8 __ovld __cnfn sign(float8 x);
float16 __ovld __cnfn sign(float16 x);
#ifdef cl_khr_fp64
double __ovld __cnfn sign(double x);
double2 __ovld __cnfn sign(double2 x);
double3 __ovld __cnfn sign(double3 x);
double4 __ovld __cnfn sign(double4 x);
double8 __ovld __cnfn sign(double8 x);
double16 __ovld __cnfn sign(double16 x);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn sign(half x);
half2 __ovld __cnfn sign(half2 x);
half3 __ovld __cnfn sign(half3 x);
half4 __ovld __cnfn sign(half4 x);
half8 __ovld __cnfn sign(half8 x);
half16 __ovld __cnfn sign(half16 x);
#endif //cl_khr_fp16
// OpenCL v1.1 s6.11.5, v1.2 s6.12.5, v2.0 s6.13.5 - Geometric Functions
/**
 * Returns the cross product of p0.xyz and p1.xyz. The
 * w component of the float4 result will be 0.0.
*/
float4 __ovld __cnfn cross(float4 p0, float4 p1);
float3 __ovld __cnfn cross(float3 p0, float3 p1);
#ifdef cl_khr_fp64
double4 __ovld __cnfn cross(double4 p0, double4 p1);
double3 __ovld __cnfn cross(double3 p0, double3 p1);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half4 __ovld __cnfn cross(half4 p0, half4 p1);
half3 __ovld __cnfn cross(half3 p0, half3 p1);
#endif //cl_khr_fp16
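// A minimal usage sketch for cross() (the variable names are hypothetical):
//
//   float4 n = cross(p0, p1); // n.xyz is the cross product of p0.xyz and p1.xyz, n.w == 0.0f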
/**
* Compute dot product.
*/
float __ovld __cnfn dot(float p0, float p1);
float __ovld __cnfn dot(float2 p0, float2 p1);
float __ovld __cnfn dot(float3 p0, float3 p1);
float __ovld __cnfn dot(float4 p0, float4 p1);
#ifdef cl_khr_fp64
double __ovld __cnfn dot(double p0, double p1);
double __ovld __cnfn dot(double2 p0, double2 p1);
double __ovld __cnfn dot(double3 p0, double3 p1);
double __ovld __cnfn dot(double4 p0, double4 p1);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn dot(half p0, half p1);
half __ovld __cnfn dot(half2 p0, half2 p1);
half __ovld __cnfn dot(half3 p0, half3 p1);
half __ovld __cnfn dot(half4 p0, half4 p1);
#endif //cl_khr_fp16
/**
* Returns the distance between p0 and p1. This is
* calculated as length(p0 - p1).
*/
float __ovld __cnfn distance(float p0, float p1);
float __ovld __cnfn distance(float2 p0, float2 p1);
float __ovld __cnfn distance(float3 p0, float3 p1);
float __ovld __cnfn distance(float4 p0, float4 p1);
#ifdef cl_khr_fp64
double __ovld __cnfn distance(double p0, double p1);
double __ovld __cnfn distance(double2 p0, double2 p1);
double __ovld __cnfn distance(double3 p0, double3 p1);
double __ovld __cnfn distance(double4 p0, double4 p1);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn distance(half p0, half p1);
half __ovld __cnfn distance(half2 p0, half2 p1);
half __ovld __cnfn distance(half3 p0, half3 p1);
half __ovld __cnfn distance(half4 p0, half4 p1);
#endif //cl_khr_fp16
/**
 * Returns the length of vector p, i.e.,
 * sqrt(p.x^2 + p.y^2 + ...)
*/
float __ovld __cnfn length(float p);
float __ovld __cnfn length(float2 p);
float __ovld __cnfn length(float3 p);
float __ovld __cnfn length(float4 p);
#ifdef cl_khr_fp64
double __ovld __cnfn length(double p);
double __ovld __cnfn length(double2 p);
double __ovld __cnfn length(double3 p);
double __ovld __cnfn length(double4 p);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn length(half p);
half __ovld __cnfn length(half2 p);
half __ovld __cnfn length(half3 p);
half __ovld __cnfn length(half4 p);
#endif //cl_khr_fp16
/**
* Returns a vector in the same direction as p but with a
* length of 1.
*/
float __ovld __cnfn normalize(float p);
float2 __ovld __cnfn normalize(float2 p);
float3 __ovld __cnfn normalize(float3 p);
float4 __ovld __cnfn normalize(float4 p);
#ifdef cl_khr_fp64
double __ovld __cnfn normalize(double p);
double2 __ovld __cnfn normalize(double2 p);
double3 __ovld __cnfn normalize(double3 p);
double4 __ovld __cnfn normalize(double4 p);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn normalize(half p);
half2 __ovld __cnfn normalize(half2 p);
half3 __ovld __cnfn normalize(half3 p);
half4 __ovld __cnfn normalize(half4 p);
#endif //cl_khr_fp16
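// A minimal sketch combining the geometric helpers above (the variable names
// are hypothetical):
//
//   float3 d    = b - a;
//   float  len  = length(d);      // same value as distance(a, b)
//   float3 dir  = normalize(d);   // unit vector in the direction of d
//   float  proj = dot(dir, v);    // scalar projection of v onto dir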
/**
* Returns fast_length(p0 - p1).
*/
float __ovld __cnfn fast_distance(float p0, float p1);
float __ovld __cnfn fast_distance(float2 p0, float2 p1);
float __ovld __cnfn fast_distance(float3 p0, float3 p1);
float __ovld __cnfn fast_distance(float4 p0, float4 p1);
#ifdef cl_khr_fp16
half __ovld __cnfn fast_distance(half p0, half p1);
half __ovld __cnfn fast_distance(half2 p0, half2 p1);
half __ovld __cnfn fast_distance(half3 p0, half3 p1);
half __ovld __cnfn fast_distance(half4 p0, half4 p1);
#endif //cl_khr_fp16
/**
* Returns the length of vector p computed as:
 * half_sqrt(p.x^2 + p.y^2 + ...)
*/
float __ovld __cnfn fast_length(float p);
float __ovld __cnfn fast_length(float2 p);
float __ovld __cnfn fast_length(float3 p);
float __ovld __cnfn fast_length(float4 p);
#ifdef cl_khr_fp16
half __ovld __cnfn fast_length(half p);
half __ovld __cnfn fast_length(half2 p);
half __ovld __cnfn fast_length(half3 p);
half __ovld __cnfn fast_length(half4 p);
#endif //cl_khr_fp16
/**
* Returns a vector in the same direction as p but with a
* length of 1. fast_normalize is computed as:
* p * half_rsqrt (p.x^2 + p.y^2 + ... )
* The result shall be within 8192 ulps error from the
* infinitely precise result of
* if (all(p == 0.0f))
* result = p;
* else
* result = p / sqrt (p.x^2 + p.y^2 + ...);
* with the following exceptions:
 * 1) If the sum of squares is greater than FLT_MAX
 * then the values in the result vector are undefined.
 * 2) If the sum of squares is less than FLT_MIN then
 * the implementation may return p.
* 3) If the device is in "denorms are flushed to zero"
* mode, individual operand elements with magnitude
* less than sqrt(FLT_MIN) may be flushed to zero
* before proceeding with the calculation.
*/
float __ovld __cnfn fast_normalize(float p);
float2 __ovld __cnfn fast_normalize(float2 p);
float3 __ovld __cnfn fast_normalize(float3 p);
float4 __ovld __cnfn fast_normalize(float4 p);
#ifdef cl_khr_fp16
half __ovld __cnfn fast_normalize(half p);
half2 __ovld __cnfn fast_normalize(half2 p);
half3 __ovld __cnfn fast_normalize(half3 p);
half4 __ovld __cnfn fast_normalize(half4 p);
#endif //cl_khr_fp16
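// A minimal sketch contrasting the fast variant with the exact one (the
// variable names are hypothetical); fast_normalize trades accuracy (up to
// 8192 ulps, per the note above) for speed:
//
//   float3 exact  = normalize(p);
//   float3 approx = fast_normalize(p); // computed via half_rsqrt, see above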
// OpenCL v1.1 s6.11.6, v1.2 s6.12.6, v2.0 s6.13.6 - Relational Functions
/**
* intn isequal (floatn x, floatn y)
* Returns the component-wise compare of x == y.
*/
int __ovld __cnfn isequal(float x, float y);
int2 __ovld __cnfn isequal(float2 x, float2 y);
int3 __ovld __cnfn isequal(float3 x, float3 y);
int4 __ovld __cnfn isequal(float4 x, float4 y);
int8 __ovld __cnfn isequal(float8 x, float8 y);
int16 __ovld __cnfn isequal(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn isequal(double x, double y);
long2 __ovld __cnfn isequal(double2 x, double2 y);
long3 __ovld __cnfn isequal(double3 x, double3 y);
long4 __ovld __cnfn isequal(double4 x, double4 y);
long8 __ovld __cnfn isequal(double8 x, double8 y);
long16 __ovld __cnfn isequal(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isequal(half x, half y);
short2 __ovld __cnfn isequal(half2 x, half2 y);
short3 __ovld __cnfn isequal(half3 x, half3 y);
short4 __ovld __cnfn isequal(half4 x, half4 y);
short8 __ovld __cnfn isequal(half8 x, half8 y);
short16 __ovld __cnfn isequal(half16 x, half16 y);
#endif //cl_khr_fp16
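// A minimal sketch of the result encoding (the variable names are
// hypothetical); per the relational-function convention, scalar forms return
// 1 for true while vector forms return -1 (all bits set) per true component:
//
//   int  s = isequal(x, y);  // 1 if x == y, otherwise 0
//   int4 m = isequal(a, b);  // each component: -1 if equal, 0 if not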
/**
* Returns the component-wise compare of x != y.
*/
int __ovld __cnfn isnotequal(float x, float y);
int2 __ovld __cnfn isnotequal(float2 x, float2 y);
int3 __ovld __cnfn isnotequal(float3 x, float3 y);
int4 __ovld __cnfn isnotequal(float4 x, float4 y);
int8 __ovld __cnfn isnotequal(float8 x, float8 y);
int16 __ovld __cnfn isnotequal(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn isnotequal(double x, double y);
long2 __ovld __cnfn isnotequal(double2 x, double2 y);
long3 __ovld __cnfn isnotequal(double3 x, double3 y);
long4 __ovld __cnfn isnotequal(double4 x, double4 y);
long8 __ovld __cnfn isnotequal(double8 x, double8 y);
long16 __ovld __cnfn isnotequal(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isnotequal(half x, half y);
short2 __ovld __cnfn isnotequal(half2 x, half2 y);
short3 __ovld __cnfn isnotequal(half3 x, half3 y);
short4 __ovld __cnfn isnotequal(half4 x, half4 y);
short8 __ovld __cnfn isnotequal(half8 x, half8 y);
short16 __ovld __cnfn isnotequal(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Returns the component-wise compare of x > y.
*/
int __ovld __cnfn isgreater(float x, float y);
int2 __ovld __cnfn isgreater(float2 x, float2 y);
int3 __ovld __cnfn isgreater(float3 x, float3 y);
int4 __ovld __cnfn isgreater(float4 x, float4 y);
int8 __ovld __cnfn isgreater(float8 x, float8 y);
int16 __ovld __cnfn isgreater(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn isgreater(double x, double y);
long2 __ovld __cnfn isgreater(double2 x, double2 y);
long3 __ovld __cnfn isgreater(double3 x, double3 y);
long4 __ovld __cnfn isgreater(double4 x, double4 y);
long8 __ovld __cnfn isgreater(double8 x, double8 y);
long16 __ovld __cnfn isgreater(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isgreater(half x, half y);
short2 __ovld __cnfn isgreater(half2 x, half2 y);
short3 __ovld __cnfn isgreater(half3 x, half3 y);
short4 __ovld __cnfn isgreater(half4 x, half4 y);
short8 __ovld __cnfn isgreater(half8 x, half8 y);
short16 __ovld __cnfn isgreater(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Returns the component-wise compare of x >= y.
*/
int __ovld __cnfn isgreaterequal(float x, float y);
int2 __ovld __cnfn isgreaterequal(float2 x, float2 y);
int3 __ovld __cnfn isgreaterequal(float3 x, float3 y);
int4 __ovld __cnfn isgreaterequal(float4 x, float4 y);
int8 __ovld __cnfn isgreaterequal(float8 x, float8 y);
int16 __ovld __cnfn isgreaterequal(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn isgreaterequal(double x, double y);
long2 __ovld __cnfn isgreaterequal(double2 x, double2 y);
long3 __ovld __cnfn isgreaterequal(double3 x, double3 y);
long4 __ovld __cnfn isgreaterequal(double4 x, double4 y);
long8 __ovld __cnfn isgreaterequal(double8 x, double8 y);
long16 __ovld __cnfn isgreaterequal(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isgreaterequal(half x, half y);
short2 __ovld __cnfn isgreaterequal(half2 x, half2 y);
short3 __ovld __cnfn isgreaterequal(half3 x, half3 y);
short4 __ovld __cnfn isgreaterequal(half4 x, half4 y);
short8 __ovld __cnfn isgreaterequal(half8 x, half8 y);
short16 __ovld __cnfn isgreaterequal(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Returns the component-wise compare of x < y.
*/
int __ovld __cnfn isless(float x, float y);
int2 __ovld __cnfn isless(float2 x, float2 y);
int3 __ovld __cnfn isless(float3 x, float3 y);
int4 __ovld __cnfn isless(float4 x, float4 y);
int8 __ovld __cnfn isless(float8 x, float8 y);
int16 __ovld __cnfn isless(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn isless(double x, double y);
long2 __ovld __cnfn isless(double2 x, double2 y);
long3 __ovld __cnfn isless(double3 x, double3 y);
long4 __ovld __cnfn isless(double4 x, double4 y);
long8 __ovld __cnfn isless(double8 x, double8 y);
long16 __ovld __cnfn isless(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isless(half x, half y);
short2 __ovld __cnfn isless(half2 x, half2 y);
short3 __ovld __cnfn isless(half3 x, half3 y);
short4 __ovld __cnfn isless(half4 x, half4 y);
short8 __ovld __cnfn isless(half8 x, half8 y);
short16 __ovld __cnfn isless(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Returns the component-wise compare of x <= y.
*/
int __ovld __cnfn islessequal(float x, float y);
int2 __ovld __cnfn islessequal(float2 x, float2 y);
int3 __ovld __cnfn islessequal(float3 x, float3 y);
int4 __ovld __cnfn islessequal(float4 x, float4 y);
int8 __ovld __cnfn islessequal(float8 x, float8 y);
int16 __ovld __cnfn islessequal(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn islessequal(double x, double y);
long2 __ovld __cnfn islessequal(double2 x, double2 y);
long3 __ovld __cnfn islessequal(double3 x, double3 y);
long4 __ovld __cnfn islessequal(double4 x, double4 y);
long8 __ovld __cnfn islessequal(double8 x, double8 y);
long16 __ovld __cnfn islessequal(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn islessequal(half x, half y);
short2 __ovld __cnfn islessequal(half2 x, half2 y);
short3 __ovld __cnfn islessequal(half3 x, half3 y);
short4 __ovld __cnfn islessequal(half4 x, half4 y);
short8 __ovld __cnfn islessequal(half8 x, half8 y);
short16 __ovld __cnfn islessequal(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Returns the component-wise compare of
 * (x < y) || (x > y).
*/
int __ovld __cnfn islessgreater(float x, float y);
int2 __ovld __cnfn islessgreater(float2 x, float2 y);
int3 __ovld __cnfn islessgreater(float3 x, float3 y);
int4 __ovld __cnfn islessgreater(float4 x, float4 y);
int8 __ovld __cnfn islessgreater(float8 x, float8 y);
int16 __ovld __cnfn islessgreater(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn islessgreater(double x, double y);
long2 __ovld __cnfn islessgreater(double2 x, double2 y);
long3 __ovld __cnfn islessgreater(double3 x, double3 y);
long4 __ovld __cnfn islessgreater(double4 x, double4 y);
long8 __ovld __cnfn islessgreater(double8 x, double8 y);
long16 __ovld __cnfn islessgreater(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn islessgreater(half x, half y);
short2 __ovld __cnfn islessgreater(half2 x, half2 y);
short3 __ovld __cnfn islessgreater(half3 x, half3 y);
short4 __ovld __cnfn islessgreater(half4 x, half4 y);
short8 __ovld __cnfn islessgreater(half8 x, half8 y);
short16 __ovld __cnfn islessgreater(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Test for finite value.
*/
int __ovld __cnfn isfinite(float);
int2 __ovld __cnfn isfinite(float2);
int3 __ovld __cnfn isfinite(float3);
int4 __ovld __cnfn isfinite(float4);
int8 __ovld __cnfn isfinite(float8);
int16 __ovld __cnfn isfinite(float16);
#ifdef cl_khr_fp64
int __ovld __cnfn isfinite(double);
long2 __ovld __cnfn isfinite(double2);
long3 __ovld __cnfn isfinite(double3);
long4 __ovld __cnfn isfinite(double4);
long8 __ovld __cnfn isfinite(double8);
long16 __ovld __cnfn isfinite(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isfinite(half);
short2 __ovld __cnfn isfinite(half2);
short3 __ovld __cnfn isfinite(half3);
short4 __ovld __cnfn isfinite(half4);
short8 __ovld __cnfn isfinite(half8);
short16 __ovld __cnfn isfinite(half16);
#endif //cl_khr_fp16
/**
 * Test for infinity value (positive or negative).
*/
int __ovld __cnfn isinf(float);
int2 __ovld __cnfn isinf(float2);
int3 __ovld __cnfn isinf(float3);
int4 __ovld __cnfn isinf(float4);
int8 __ovld __cnfn isinf(float8);
int16 __ovld __cnfn isinf(float16);
#ifdef cl_khr_fp64
int __ovld __cnfn isinf(double);
long2 __ovld __cnfn isinf(double2);
long3 __ovld __cnfn isinf(double3);
long4 __ovld __cnfn isinf(double4);
long8 __ovld __cnfn isinf(double8);
long16 __ovld __cnfn isinf(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isinf(half);
short2 __ovld __cnfn isinf(half2);
short3 __ovld __cnfn isinf(half3);
short4 __ovld __cnfn isinf(half4);
short8 __ovld __cnfn isinf(half8);
short16 __ovld __cnfn isinf(half16);
#endif //cl_khr_fp16
/**
* Test for a NaN.
*/
int __ovld __cnfn isnan(float);
int2 __ovld __cnfn isnan(float2);
int3 __ovld __cnfn isnan(float3);
int4 __ovld __cnfn isnan(float4);
int8 __ovld __cnfn isnan(float8);
int16 __ovld __cnfn isnan(float16);
#ifdef cl_khr_fp64
int __ovld __cnfn isnan(double);
long2 __ovld __cnfn isnan(double2);
long3 __ovld __cnfn isnan(double3);
long4 __ovld __cnfn isnan(double4);
long8 __ovld __cnfn isnan(double8);
long16 __ovld __cnfn isnan(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isnan(half);
short2 __ovld __cnfn isnan(half2);
short3 __ovld __cnfn isnan(half3);
short4 __ovld __cnfn isnan(half4);
short8 __ovld __cnfn isnan(half8);
short16 __ovld __cnfn isnan(half16);
#endif //cl_khr_fp16
/**
* Test for a normal value.
*/
int __ovld __cnfn isnormal(float);
int2 __ovld __cnfn isnormal(float2);
int3 __ovld __cnfn isnormal(float3);
int4 __ovld __cnfn isnormal(float4);
int8 __ovld __cnfn isnormal(float8);
int16 __ovld __cnfn isnormal(float16);
#ifdef cl_khr_fp64
int __ovld __cnfn isnormal(double);
long2 __ovld __cnfn isnormal(double2);
long3 __ovld __cnfn isnormal(double3);
long4 __ovld __cnfn isnormal(double4);
long8 __ovld __cnfn isnormal(double8);
long16 __ovld __cnfn isnormal(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isnormal(half);
short2 __ovld __cnfn isnormal(half2);
short3 __ovld __cnfn isnormal(half3);
short4 __ovld __cnfn isnormal(half4);
short8 __ovld __cnfn isnormal(half8);
short16 __ovld __cnfn isnormal(half16);
#endif //cl_khr_fp16
/**
* Test if arguments are ordered. isordered() takes
* arguments x and y, and returns the result
* isequal(x, x) && isequal(y, y).
*/
int __ovld __cnfn isordered(float x, float y);
int2 __ovld __cnfn isordered(float2 x, float2 y);
int3 __ovld __cnfn isordered(float3 x, float3 y);
int4 __ovld __cnfn isordered(float4 x, float4 y);
int8 __ovld __cnfn isordered(float8 x, float8 y);
int16 __ovld __cnfn isordered(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn isordered(double x, double y);
long2 __ovld __cnfn isordered(double2 x, double2 y);
long3 __ovld __cnfn isordered(double3 x, double3 y);
long4 __ovld __cnfn isordered(double4 x, double4 y);
long8 __ovld __cnfn isordered(double8 x, double8 y);
long16 __ovld __cnfn isordered(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isordered(half x, half y);
short2 __ovld __cnfn isordered(half2 x, half2 y);
short3 __ovld __cnfn isordered(half3 x, half3 y);
short4 __ovld __cnfn isordered(half4 x, half4 y);
short8 __ovld __cnfn isordered(half8 x, half8 y);
short16 __ovld __cnfn isordered(half16 x, half16 y);
#endif //cl_khr_fp16
/**
* Test if arguments are unordered. isunordered()
* takes arguments x and y, returning non-zero if x or y
* is NaN, and zero otherwise.
*/
int __ovld __cnfn isunordered(float x, float y);
int2 __ovld __cnfn isunordered(float2 x, float2 y);
int3 __ovld __cnfn isunordered(float3 x, float3 y);
int4 __ovld __cnfn isunordered(float4 x, float4 y);
int8 __ovld __cnfn isunordered(float8 x, float8 y);
int16 __ovld __cnfn isunordered(float16 x, float16 y);
#ifdef cl_khr_fp64
int __ovld __cnfn isunordered(double x, double y);
long2 __ovld __cnfn isunordered(double2 x, double2 y);
long3 __ovld __cnfn isunordered(double3 x, double3 y);
long4 __ovld __cnfn isunordered(double4 x, double4 y);
long8 __ovld __cnfn isunordered(double8 x, double8 y);
long16 __ovld __cnfn isunordered(double16 x, double16 y);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn isunordered(half x, half y);
short2 __ovld __cnfn isunordered(half2 x, half2 y);
short3 __ovld __cnfn isunordered(half3 x, half3 y);
short4 __ovld __cnfn isunordered(half4 x, half4 y);
short8 __ovld __cnfn isunordered(half8 x, half8 y);
short16 __ovld __cnfn isunordered(half16 x, half16 y);
#endif //cl_khr_fp16
/**
 * Test for sign bit. The scalar version returns 1 if the
 * sign bit of the float is set and 0 otherwise. The vector
 * version returns, for each component of floatn, -1 if the
 * sign bit of that component is set and 0 otherwise.
*/
int __ovld __cnfn signbit(float);
int2 __ovld __cnfn signbit(float2);
int3 __ovld __cnfn signbit(float3);
int4 __ovld __cnfn signbit(float4);
int8 __ovld __cnfn signbit(float8);
int16 __ovld __cnfn signbit(float16);
#ifdef cl_khr_fp64
int __ovld __cnfn signbit(double);
long2 __ovld __cnfn signbit(double2);
long3 __ovld __cnfn signbit(double3);
long4 __ovld __cnfn signbit(double4);
long8 __ovld __cnfn signbit(double8);
long16 __ovld __cnfn signbit(double16);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
int __ovld __cnfn signbit(half);
short2 __ovld __cnfn signbit(half2);
short3 __ovld __cnfn signbit(half3);
short4 __ovld __cnfn signbit(half4);
short8 __ovld __cnfn signbit(half8);
short16 __ovld __cnfn signbit(half16);
#endif //cl_khr_fp16
/**
* Returns 1 if the most significant bit in any component
* of x is set; otherwise returns 0.
*/
int __ovld __cnfn any(char x);
int __ovld __cnfn any(char2 x);
int __ovld __cnfn any(char3 x);
int __ovld __cnfn any(char4 x);
int __ovld __cnfn any(char8 x);
int __ovld __cnfn any(char16 x);
int __ovld __cnfn any(short x);
int __ovld __cnfn any(short2 x);
int __ovld __cnfn any(short3 x);
int __ovld __cnfn any(short4 x);
int __ovld __cnfn any(short8 x);
int __ovld __cnfn any(short16 x);
int __ovld __cnfn any(int x);
int __ovld __cnfn any(int2 x);
int __ovld __cnfn any(int3 x);
int __ovld __cnfn any(int4 x);
int __ovld __cnfn any(int8 x);
int __ovld __cnfn any(int16 x);
int __ovld __cnfn any(long x);
int __ovld __cnfn any(long2 x);
int __ovld __cnfn any(long3 x);
int __ovld __cnfn any(long4 x);
int __ovld __cnfn any(long8 x);
int __ovld __cnfn any(long16 x);
/**
* Returns 1 if the most significant bit in all components
* of x is set; otherwise returns 0.
*/
int __ovld __cnfn all(char x);
int __ovld __cnfn all(char2 x);
int __ovld __cnfn all(char3 x);
int __ovld __cnfn all(char4 x);
int __ovld __cnfn all(char8 x);
int __ovld __cnfn all(char16 x);
int __ovld __cnfn all(short x);
int __ovld __cnfn all(short2 x);
int __ovld __cnfn all(short3 x);
int __ovld __cnfn all(short4 x);
int __ovld __cnfn all(short8 x);
int __ovld __cnfn all(short16 x);
int __ovld __cnfn all(int x);
int __ovld __cnfn all(int2 x);
int __ovld __cnfn all(int3 x);
int __ovld __cnfn all(int4 x);
int __ovld __cnfn all(int8 x);
int __ovld __cnfn all(int16 x);
int __ovld __cnfn all(long x);
int __ovld __cnfn all(long2 x);
int __ovld __cnfn all(long3 x);
int __ovld __cnfn all(long4 x);
int __ovld __cnfn all(long8 x);
int __ovld __cnfn all(long16 x);
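// A minimal sketch using any()/all() with relational results (the variable
// names are hypothetical):
//
//   if (any(isnan(v)))      // true if at least one component of v is NaN
//     v = (float4)(0.0f);
//   if (all(isfinite(v)))   // true only if every component of v is finite
//     out[i] = v;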
/**
* Each bit of the result is the corresponding bit of a if
* the corresponding bit of c is 0. Otherwise it is the
* corresponding bit of b.
*/
char __ovld __cnfn bitselect(char a, char b, char c);
uchar __ovld __cnfn bitselect(uchar a, uchar b, uchar c);
char2 __ovld __cnfn bitselect(char2 a, char2 b, char2 c);
uchar2 __ovld __cnfn bitselect(uchar2 a, uchar2 b, uchar2 c);
char3 __ovld __cnfn bitselect(char3 a, char3 b, char3 c);
uchar3 __ovld __cnfn bitselect(uchar3 a, uchar3 b, uchar3 c);
char4 __ovld __cnfn bitselect(char4 a, char4 b, char4 c);
uchar4 __ovld __cnfn bitselect(uchar4 a, uchar4 b, uchar4 c);
char8 __ovld __cnfn bitselect(char8 a, char8 b, char8 c);
uchar8 __ovld __cnfn bitselect(uchar8 a, uchar8 b, uchar8 c);
char16 __ovld __cnfn bitselect(char16 a, char16 b, char16 c);
uchar16 __ovld __cnfn bitselect(uchar16 a, uchar16 b, uchar16 c);
short __ovld __cnfn bitselect(short a, short b, short c);
ushort __ovld __cnfn bitselect(ushort a, ushort b, ushort c);
short2 __ovld __cnfn bitselect(short2 a, short2 b, short2 c);
ushort2 __ovld __cnfn bitselect(ushort2 a, ushort2 b, ushort2 c);
short3 __ovld __cnfn bitselect(short3 a, short3 b, short3 c);
ushort3 __ovld __cnfn bitselect(ushort3 a, ushort3 b, ushort3 c);
short4 __ovld __cnfn bitselect(short4 a, short4 b, short4 c);
ushort4 __ovld __cnfn bitselect(ushort4 a, ushort4 b, ushort4 c);
short8 __ovld __cnfn bitselect(short8 a, short8 b, short8 c);
ushort8 __ovld __cnfn bitselect(ushort8 a, ushort8 b, ushort8 c);
short16 __ovld __cnfn bitselect(short16 a, short16 b, short16 c);
ushort16 __ovld __cnfn bitselect(ushort16 a, ushort16 b, ushort16 c);
int __ovld __cnfn bitselect(int a, int b, int c);
uint __ovld __cnfn bitselect(uint a, uint b, uint c);
int2 __ovld __cnfn bitselect(int2 a, int2 b, int2 c);
uint2 __ovld __cnfn bitselect(uint2 a, uint2 b, uint2 c);
int3 __ovld __cnfn bitselect(int3 a, int3 b, int3 c);
uint3 __ovld __cnfn bitselect(uint3 a, uint3 b, uint3 c);
int4 __ovld __cnfn bitselect(int4 a, int4 b, int4 c);
uint4 __ovld __cnfn bitselect(uint4 a, uint4 b, uint4 c);
int8 __ovld __cnfn bitselect(int8 a, int8 b, int8 c);
uint8 __ovld __cnfn bitselect(uint8 a, uint8 b, uint8 c);
int16 __ovld __cnfn bitselect(int16 a, int16 b, int16 c);
uint16 __ovld __cnfn bitselect(uint16 a, uint16 b, uint16 c);
long __ovld __cnfn bitselect(long a, long b, long c);
ulong __ovld __cnfn bitselect(ulong a, ulong b, ulong c);
long2 __ovld __cnfn bitselect(long2 a, long2 b, long2 c);
ulong2 __ovld __cnfn bitselect(ulong2 a, ulong2 b, ulong2 c);
long3 __ovld __cnfn bitselect(long3 a, long3 b, long3 c);
ulong3 __ovld __cnfn bitselect(ulong3 a, ulong3 b, ulong3 c);
long4 __ovld __cnfn bitselect(long4 a, long4 b, long4 c);
ulong4 __ovld __cnfn bitselect(ulong4 a, ulong4 b, ulong4 c);
long8 __ovld __cnfn bitselect(long8 a, long8 b, long8 c);
ulong8 __ovld __cnfn bitselect(ulong8 a, ulong8 b, ulong8 c);
long16 __ovld __cnfn bitselect(long16 a, long16 b, long16 c);
ulong16 __ovld __cnfn bitselect(ulong16 a, ulong16 b, ulong16 c);
float __ovld __cnfn bitselect(float a, float b, float c);
float2 __ovld __cnfn bitselect(float2 a, float2 b, float2 c);
float3 __ovld __cnfn bitselect(float3 a, float3 b, float3 c);
float4 __ovld __cnfn bitselect(float4 a, float4 b, float4 c);
float8 __ovld __cnfn bitselect(float8 a, float8 b, float8 c);
float16 __ovld __cnfn bitselect(float16 a, float16 b, float16 c);
#ifdef cl_khr_fp64
double __ovld __cnfn bitselect(double a, double b, double c);
double2 __ovld __cnfn bitselect(double2 a, double2 b, double2 c);
double3 __ovld __cnfn bitselect(double3 a, double3 b, double3 c);
double4 __ovld __cnfn bitselect(double4 a, double4 b, double4 c);
double8 __ovld __cnfn bitselect(double8 a, double8 b, double8 c);
double16 __ovld __cnfn bitselect(double16 a, double16 b, double16 c);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn bitselect(half a, half b, half c);
half2 __ovld __cnfn bitselect(half2 a, half2 b, half2 c);
half3 __ovld __cnfn bitselect(half3 a, half3 b, half3 c);
half4 __ovld __cnfn bitselect(half4 a, half4 b, half4 c);
half8 __ovld __cnfn bitselect(half8 a, half8 b, half8 c);
half16 __ovld __cnfn bitselect(half16 a, half16 b, half16 c);
#endif //cl_khr_fp16
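// A minimal sketch of bitselect() semantics (the variable names are
// hypothetical); the result is a bitwise merge of a and b under mask c:
//
//   uint4 r = bitselect(a, b, mask); // r == (a & ~mask) | (b & mask), bit by bit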
/**
* For each component of a vector type,
 * result[i] = (MSB of c[i] is set) ? b[i] : a[i].
* For a scalar type, result = c ? b : a.
* b and a must have the same type.
* c must have the same number of elements and bits as a.
*/
char __ovld __cnfn select(char a, char b, char c);
uchar __ovld __cnfn select(uchar a, uchar b, char c);
char2 __ovld __cnfn select(char2 a, char2 b, char2 c);
uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, char2 c);
char3 __ovld __cnfn select(char3 a, char3 b, char3 c);
uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, char3 c);
char4 __ovld __cnfn select(char4 a, char4 b, char4 c);
uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, char4 c);
char8 __ovld __cnfn select(char8 a, char8 b, char8 c);
uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c);
char16 __ovld __cnfn select(char16 a, char16 b, char16 c);
uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c);
short __ovld __cnfn select(short a, short b, short c);
ushort __ovld __cnfn select(ushort a, ushort b, short c);
short2 __ovld __cnfn select(short2 a, short2 b, short2 c);
ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, short2 c);
short3 __ovld __cnfn select(short3 a, short3 b, short3 c);
ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, short3 c);
short4 __ovld __cnfn select(short4 a, short4 b, short4 c);
ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, short4 c);
short8 __ovld __cnfn select(short8 a, short8 b, short8 c);
ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c);
short16 __ovld __cnfn select(short16 a, short16 b, short16 c);
ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c);
int __ovld __cnfn select(int a, int b, int c);
uint __ovld __cnfn select(uint a, uint b, int c);
int2 __ovld __cnfn select(int2 a, int2 b, int2 c);
uint2 __ovld __cnfn select(uint2 a, uint2 b, int2 c);
int3 __ovld __cnfn select(int3 a, int3 b, int3 c);
uint3 __ovld __cnfn select(uint3 a, uint3 b, int3 c);
int4 __ovld __cnfn select(int4 a, int4 b, int4 c);
uint4 __ovld __cnfn select(uint4 a, uint4 b, int4 c);
int8 __ovld __cnfn select(int8 a, int8 b, int8 c);
uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c);
int16 __ovld __cnfn select(int16 a, int16 b, int16 c);
uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c);
float __ovld __cnfn select(float a, float b, int c);
float2 __ovld __cnfn select(float2 a, float2 b, int2 c);
float3 __ovld __cnfn select(float3 a, float3 b, int3 c);
float4 __ovld __cnfn select(float4 a, float4 b, int4 c);
float8 __ovld __cnfn select(float8 a, float8 b, int8 c);
float16 __ovld __cnfn select(float16 a, float16 b, int16 c);
long __ovld __cnfn select(long a, long b, long c);
ulong __ovld __cnfn select(ulong a, ulong b, long c);
long2 __ovld __cnfn select(long2 a, long2 b, long2 c);
ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, long2 c);
long3 __ovld __cnfn select(long3 a, long3 b, long3 c);
ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, long3 c);
long4 __ovld __cnfn select(long4 a, long4 b, long4 c);
ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, long4 c);
long8 __ovld __cnfn select(long8 a, long8 b, long8 c);
ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c);
long16 __ovld __cnfn select(long16 a, long16 b, long16 c);
ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c);
char __ovld __cnfn select(char a, char b, uchar c);
uchar __ovld __cnfn select(uchar a, uchar b, uchar c);
char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c);
uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uchar2 c);
char3 __ovld __cnfn select(char3 a, char3 b, uchar3 c);
uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uchar3 c);
char4 __ovld __cnfn select(char4 a, char4 b, uchar4 c);
uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uchar4 c);
char8 __ovld __cnfn select(char8 a, char8 b, uchar8 c);
uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c);
char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c);
uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c);
short __ovld __cnfn select(short a, short b, ushort c);
ushort __ovld __cnfn select(ushort a, ushort b, ushort c);
short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c);
ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ushort2 c);
short3 __ovld __cnfn select(short3 a, short3 b, ushort3 c);
ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ushort3 c);
short4 __ovld __cnfn select(short4 a, short4 b, ushort4 c);
ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ushort4 c);
short8 __ovld __cnfn select(short8 a, short8 b, ushort8 c);
ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c);
short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c);
ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c);
int __ovld __cnfn select(int a, int b, uint c);
uint __ovld __cnfn select(uint a, uint b, uint c);
int2 __ovld __cnfn select(int2 a, int2 b, uint2 c);
uint2 __ovld __cnfn select(uint2 a, uint2 b, uint2 c);
int3 __ovld __cnfn select(int3 a, int3 b, uint3 c);
uint3 __ovld __cnfn select(uint3 a, uint3 b, uint3 c);
int4 __ovld __cnfn select(int4 a, int4 b, uint4 c);
uint4 __ovld __cnfn select(uint4 a, uint4 b, uint4 c);
int8 __ovld __cnfn select(int8 a, int8 b, uint8 c);
uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c);
int16 __ovld __cnfn select(int16 a, int16 b, uint16 c);
uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c);
float __ovld __cnfn select(float a, float b, uint c);
float2 __ovld __cnfn select(float2 a, float2 b, uint2 c);
float3 __ovld __cnfn select(float3 a, float3 b, uint3 c);
float4 __ovld __cnfn select(float4 a, float4 b, uint4 c);
float8 __ovld __cnfn select(float8 a, float8 b, uint8 c);
float16 __ovld __cnfn select(float16 a, float16 b, uint16 c);
long __ovld __cnfn select(long a, long b, ulong c);
ulong __ovld __cnfn select(ulong a, ulong b, ulong c);
long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c);
ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ulong2 c);
long3 __ovld __cnfn select(long3 a, long3 b, ulong3 c);
ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ulong3 c);
long4 __ovld __cnfn select(long4 a, long4 b, ulong4 c);
ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ulong4 c);
long8 __ovld __cnfn select(long8 a, long8 b, ulong8 c);
ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c);
long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c);
ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c);
#ifdef cl_khr_fp64
double __ovld __cnfn select(double a, double b, long c);
double2 __ovld __cnfn select(double2 a, double2 b, long2 c);
double3 __ovld __cnfn select(double3 a, double3 b, long3 c);
double4 __ovld __cnfn select(double4 a, double4 b, long4 c);
double8 __ovld __cnfn select(double8 a, double8 b, long8 c);
double16 __ovld __cnfn select(double16 a, double16 b, long16 c);
double __ovld __cnfn select(double a, double b, ulong c);
double2 __ovld __cnfn select(double2 a, double2 b, ulong2 c);
double3 __ovld __cnfn select(double3 a, double3 b, ulong3 c);
double4 __ovld __cnfn select(double4 a, double4 b, ulong4 c);
double8 __ovld __cnfn select(double8 a, double8 b, ulong8 c);
double16 __ovld __cnfn select(double16 a, double16 b, ulong16 c);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __cnfn select(half a, half b, short c);
half2 __ovld __cnfn select(half2 a, half2 b, short2 c);
half3 __ovld __cnfn select(half3 a, half3 b, short3 c);
half4 __ovld __cnfn select(half4 a, half4 b, short4 c);
half8 __ovld __cnfn select(half8 a, half8 b, short8 c);
half16 __ovld __cnfn select(half16 a, half16 b, short16 c);
half __ovld __cnfn select(half a, half b, ushort c);
half2 __ovld __cnfn select(half2 a, half2 b, ushort2 c);
half3 __ovld __cnfn select(half3 a, half3 b, ushort3 c);
half4 __ovld __cnfn select(half4 a, half4 b, ushort4 c);
half8 __ovld __cnfn select(half8 a, half8 b, ushort8 c);
half16 __ovld __cnfn select(half16 a, half16 b, ushort16 c);
#endif //cl_khr_fp16
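// A minimal sketch combining select() with a vector comparison (the variable
// names are hypothetical); vector relational results set the MSB for true
// components, which is exactly what select() tests:
//
//   float4 r = select(a, b, isless(x, y)); // per component: b where x < y, else a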
// OpenCL v1.1 s6.11.7, v1.2 s6.12.7, v2.0 s6.13.7 - Vector Data Load and Store Functions
// OpenCL extensions v1.1 s9.6.6, v1.2 s9.5.6, v2.0 s9.4.6 - Vector Data Load and Store Functions for Half Type
/**
* Use generic type gentype to indicate the built-in data types
* char, uchar, short, ushort, int, uint, long, ulong, float,
* double or half.
*
 * vloadn returns sizeof (gentypen) bytes of data read from address (p + (offset * n)).
 *
 * vstoren writes sizeof (gentypen) bytes given by data to address (p + (offset * n)).
*
* The address computed as (p + (offset * n)) must be
* 8-bit aligned if gentype is char, uchar;
* 16-bit aligned if gentype is short, ushort, half;
* 32-bit aligned if gentype is int, uint, float;
* 64-bit aligned if gentype is long, ulong, double.
*/
char2 __ovld vload2(size_t offset, const __constant char *p);
uchar2 __ovld vload2(size_t offset, const __constant uchar *p);
short2 __ovld vload2(size_t offset, const __constant short *p);
ushort2 __ovld vload2(size_t offset, const __constant ushort *p);
int2 __ovld vload2(size_t offset, const __constant int *p);
uint2 __ovld vload2(size_t offset, const __constant uint *p);
long2 __ovld vload2(size_t offset, const __constant long *p);
ulong2 __ovld vload2(size_t offset, const __constant ulong *p);
float2 __ovld vload2(size_t offset, const __constant float *p);
char3 __ovld vload3(size_t offset, const __constant char *p);
uchar3 __ovld vload3(size_t offset, const __constant uchar *p);
short3 __ovld vload3(size_t offset, const __constant short *p);
ushort3 __ovld vload3(size_t offset, const __constant ushort *p);
int3 __ovld vload3(size_t offset, const __constant int *p);
uint3 __ovld vload3(size_t offset, const __constant uint *p);
long3 __ovld vload3(size_t offset, const __constant long *p);
ulong3 __ovld vload3(size_t offset, const __constant ulong *p);
float3 __ovld vload3(size_t offset, const __constant float *p);
char4 __ovld vload4(size_t offset, const __constant char *p);
uchar4 __ovld vload4(size_t offset, const __constant uchar *p);
short4 __ovld vload4(size_t offset, const __constant short *p);
ushort4 __ovld vload4(size_t offset, const __constant ushort *p);
int4 __ovld vload4(size_t offset, const __constant int *p);
uint4 __ovld vload4(size_t offset, const __constant uint *p);
long4 __ovld vload4(size_t offset, const __constant long *p);
ulong4 __ovld vload4(size_t offset, const __constant ulong *p);
float4 __ovld vload4(size_t offset, const __constant float *p);
char8 __ovld vload8(size_t offset, const __constant char *p);
uchar8 __ovld vload8(size_t offset, const __constant uchar *p);
short8 __ovld vload8(size_t offset, const __constant short *p);
ushort8 __ovld vload8(size_t offset, const __constant ushort *p);
int8 __ovld vload8(size_t offset, const __constant int *p);
uint8 __ovld vload8(size_t offset, const __constant uint *p);
long8 __ovld vload8(size_t offset, const __constant long *p);
ulong8 __ovld vload8(size_t offset, const __constant ulong *p);
float8 __ovld vload8(size_t offset, const __constant float *p);
char16 __ovld vload16(size_t offset, const __constant char *p);
uchar16 __ovld vload16(size_t offset, const __constant uchar *p);
short16 __ovld vload16(size_t offset, const __constant short *p);
ushort16 __ovld vload16(size_t offset, const __constant ushort *p);
int16 __ovld vload16(size_t offset, const __constant int *p);
uint16 __ovld vload16(size_t offset, const __constant uint *p);
long16 __ovld vload16(size_t offset, const __constant long *p);
ulong16 __ovld vload16(size_t offset, const __constant ulong *p);
float16 __ovld vload16(size_t offset, const __constant float *p);
#ifdef cl_khr_fp64
double2 __ovld vload2(size_t offset, const __constant double *p);
double3 __ovld vload3(size_t offset, const __constant double *p);
double4 __ovld vload4(size_t offset, const __constant double *p);
double8 __ovld vload8(size_t offset, const __constant double *p);
double16 __ovld vload16(size_t offset, const __constant double *p);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld vload(size_t offset, const __constant half *p);
half2 __ovld vload2(size_t offset, const __constant half *p);
half3 __ovld vload3(size_t offset, const __constant half *p);
half4 __ovld vload4(size_t offset, const __constant half *p);
half8 __ovld vload8(size_t offset, const __constant half *p);
half16 __ovld vload16(size_t offset, const __constant half *p);
#endif //cl_khr_fp16
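// A minimal usage sketch for vloadn/vstoren (the kernel and buffer names are
// hypothetical; vstore3 is declared with the other vstoren overloads later in
// this header):
//
//   __kernel void copy3(__global const float *src, __global float *dst) {
//     size_t i = get_global_id(0);
//     float3 v = vload3(i, src); // reads src[3*i .. 3*i+2]
//     vstore3(v, i, dst);        // writes dst[3*i .. 3*i+2]
//   }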
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
char2 __ovld vload2(size_t offset, const char *p);
uchar2 __ovld vload2(size_t offset, const uchar *p);
short2 __ovld vload2(size_t offset, const short *p);
ushort2 __ovld vload2(size_t offset, const ushort *p);
int2 __ovld vload2(size_t offset, const int *p);
uint2 __ovld vload2(size_t offset, const uint *p);
long2 __ovld vload2(size_t offset, const long *p);
ulong2 __ovld vload2(size_t offset, const ulong *p);
float2 __ovld vload2(size_t offset, const float *p);
char3 __ovld vload3(size_t offset, const char *p);
uchar3 __ovld vload3(size_t offset, const uchar *p);
short3 __ovld vload3(size_t offset, const short *p);
ushort3 __ovld vload3(size_t offset, const ushort *p);
int3 __ovld vload3(size_t offset, const int *p);
uint3 __ovld vload3(size_t offset, const uint *p);
long3 __ovld vload3(size_t offset, const long *p);
ulong3 __ovld vload3(size_t offset, const ulong *p);
float3 __ovld vload3(size_t offset, const float *p);
char4 __ovld vload4(size_t offset, const char *p);
uchar4 __ovld vload4(size_t offset, const uchar *p);
short4 __ovld vload4(size_t offset, const short *p);
ushort4 __ovld vload4(size_t offset, const ushort *p);
int4 __ovld vload4(size_t offset, const int *p);
uint4 __ovld vload4(size_t offset, const uint *p);
long4 __ovld vload4(size_t offset, const long *p);
ulong4 __ovld vload4(size_t offset, const ulong *p);
float4 __ovld vload4(size_t offset, const float *p);
char8 __ovld vload8(size_t offset, const char *p);
uchar8 __ovld vload8(size_t offset, const uchar *p);
short8 __ovld vload8(size_t offset, const short *p);
ushort8 __ovld vload8(size_t offset, const ushort *p);
int8 __ovld vload8(size_t offset, const int *p);
uint8 __ovld vload8(size_t offset, const uint *p);
long8 __ovld vload8(size_t offset, const long *p);
ulong8 __ovld vload8(size_t offset, const ulong *p);
float8 __ovld vload8(size_t offset, const float *p);
char16 __ovld vload16(size_t offset, const char *p);
uchar16 __ovld vload16(size_t offset, const uchar *p);
short16 __ovld vload16(size_t offset, const short *p);
ushort16 __ovld vload16(size_t offset, const ushort *p);
int16 __ovld vload16(size_t offset, const int *p);
uint16 __ovld vload16(size_t offset, const uint *p);
long16 __ovld vload16(size_t offset, const long *p);
ulong16 __ovld vload16(size_t offset, const ulong *p);
float16 __ovld vload16(size_t offset, const float *p);
#ifdef cl_khr_fp64
double2 __ovld vload2(size_t offset, const double *p);
double3 __ovld vload3(size_t offset, const double *p);
double4 __ovld vload4(size_t offset, const double *p);
double8 __ovld vload8(size_t offset, const double *p);
double16 __ovld vload16(size_t offset, const double *p);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld vload(size_t offset, const half *p);
half2 __ovld vload2(size_t offset, const half *p);
half3 __ovld vload3(size_t offset, const half *p);
half4 __ovld vload4(size_t offset, const half *p);
half8 __ovld vload8(size_t offset, const half *p);
half16 __ovld vload16(size_t offset, const half *p);
#endif //cl_khr_fp16
#else
char2 __ovld vload2(size_t offset, const __global char *p);
uchar2 __ovld vload2(size_t offset, const __global uchar *p);
short2 __ovld vload2(size_t offset, const __global short *p);
ushort2 __ovld vload2(size_t offset, const __global ushort *p);
int2 __ovld vload2(size_t offset, const __global int *p);
uint2 __ovld vload2(size_t offset, const __global uint *p);
long2 __ovld vload2(size_t offset, const __global long *p);
ulong2 __ovld vload2(size_t offset, const __global ulong *p);
float2 __ovld vload2(size_t offset, const __global float *p);
char3 __ovld vload3(size_t offset, const __global char *p);
uchar3 __ovld vload3(size_t offset, const __global uchar *p);
short3 __ovld vload3(size_t offset, const __global short *p);
ushort3 __ovld vload3(size_t offset, const __global ushort *p);
int3 __ovld vload3(size_t offset, const __global int *p);
uint3 __ovld vload3(size_t offset, const __global uint *p);
long3 __ovld vload3(size_t offset, const __global long *p);
ulong3 __ovld vload3(size_t offset, const __global ulong *p);
float3 __ovld vload3(size_t offset, const __global float *p);
char4 __ovld vload4(size_t offset, const __global char *p);
uchar4 __ovld vload4(size_t offset, const __global uchar *p);
short4 __ovld vload4(size_t offset, const __global short *p);
ushort4 __ovld vload4(size_t offset, const __global ushort *p);
int4 __ovld vload4(size_t offset, const __global int *p);
uint4 __ovld vload4(size_t offset, const __global uint *p);
long4 __ovld vload4(size_t offset, const __global long *p);
ulong4 __ovld vload4(size_t offset, const __global ulong *p);
float4 __ovld vload4(size_t offset, const __global float *p);
char8 __ovld vload8(size_t offset, const __global char *p);
uchar8 __ovld vload8(size_t offset, const __global uchar *p);
short8 __ovld vload8(size_t offset, const __global short *p);
ushort8 __ovld vload8(size_t offset, const __global ushort *p);
int8 __ovld vload8(size_t offset, const __global int *p);
uint8 __ovld vload8(size_t offset, const __global uint *p);
long8 __ovld vload8(size_t offset, const __global long *p);
ulong8 __ovld vload8(size_t offset, const __global ulong *p);
float8 __ovld vload8(size_t offset, const __global float *p);
char16 __ovld vload16(size_t offset, const __global char *p);
uchar16 __ovld vload16(size_t offset, const __global uchar *p);
short16 __ovld vload16(size_t offset, const __global short *p);
ushort16 __ovld vload16(size_t offset, const __global ushort *p);
int16 __ovld vload16(size_t offset, const __global int *p);
uint16 __ovld vload16(size_t offset, const __global uint *p);
long16 __ovld vload16(size_t offset, const __global long *p);
ulong16 __ovld vload16(size_t offset, const __global ulong *p);
float16 __ovld vload16(size_t offset, const __global float *p);
char2 __ovld vload2(size_t offset, const __local char *p);
uchar2 __ovld vload2(size_t offset, const __local uchar *p);
short2 __ovld vload2(size_t offset, const __local short *p);
ushort2 __ovld vload2(size_t offset, const __local ushort *p);
int2 __ovld vload2(size_t offset, const __local int *p);
uint2 __ovld vload2(size_t offset, const __local uint *p);
long2 __ovld vload2(size_t offset, const __local long *p);
ulong2 __ovld vload2(size_t offset, const __local ulong *p);
float2 __ovld vload2(size_t offset, const __local float *p);
char3 __ovld vload3(size_t offset, const __local char *p);
uchar3 __ovld vload3(size_t offset, const __local uchar *p);
short3 __ovld vload3(size_t offset, const __local short *p);
ushort3 __ovld vload3(size_t offset, const __local ushort *p);
int3 __ovld vload3(size_t offset, const __local int *p);
uint3 __ovld vload3(size_t offset, const __local uint *p);
long3 __ovld vload3(size_t offset, const __local long *p);
ulong3 __ovld vload3(size_t offset, const __local ulong *p);
float3 __ovld vload3(size_t offset, const __local float *p);
char4 __ovld vload4(size_t offset, const __local char *p);
uchar4 __ovld vload4(size_t offset, const __local uchar *p);
short4 __ovld vload4(size_t offset, const __local short *p);
ushort4 __ovld vload4(size_t offset, const __local ushort *p);
int4 __ovld vload4(size_t offset, const __local int *p);
uint4 __ovld vload4(size_t offset, const __local uint *p);
long4 __ovld vload4(size_t offset, const __local long *p);
ulong4 __ovld vload4(size_t offset, const __local ulong *p);
float4 __ovld vload4(size_t offset, const __local float *p);
char8 __ovld vload8(size_t offset, const __local char *p);
uchar8 __ovld vload8(size_t offset, const __local uchar *p);
short8 __ovld vload8(size_t offset, const __local short *p);
ushort8 __ovld vload8(size_t offset, const __local ushort *p);
int8 __ovld vload8(size_t offset, const __local int *p);
uint8 __ovld vload8(size_t offset, const __local uint *p);
long8 __ovld vload8(size_t offset, const __local long *p);
ulong8 __ovld vload8(size_t offset, const __local ulong *p);
float8 __ovld vload8(size_t offset, const __local float *p);
char16 __ovld vload16(size_t offset, const __local char *p);
uchar16 __ovld vload16(size_t offset, const __local uchar *p);
short16 __ovld vload16(size_t offset, const __local short *p);
ushort16 __ovld vload16(size_t offset, const __local ushort *p);
int16 __ovld vload16(size_t offset, const __local int *p);
uint16 __ovld vload16(size_t offset, const __local uint *p);
long16 __ovld vload16(size_t offset, const __local long *p);
ulong16 __ovld vload16(size_t offset, const __local ulong *p);
float16 __ovld vload16(size_t offset, const __local float *p);
char2 __ovld vload2(size_t offset, const __private char *p);
uchar2 __ovld vload2(size_t offset, const __private uchar *p);
short2 __ovld vload2(size_t offset, const __private short *p);
ushort2 __ovld vload2(size_t offset, const __private ushort *p);
int2 __ovld vload2(size_t offset, const __private int *p);
uint2 __ovld vload2(size_t offset, const __private uint *p);
long2 __ovld vload2(size_t offset, const __private long *p);
ulong2 __ovld vload2(size_t offset, const __private ulong *p);
float2 __ovld vload2(size_t offset, const __private float *p);
char3 __ovld vload3(size_t offset, const __private char *p);
uchar3 __ovld vload3(size_t offset, const __private uchar *p);
short3 __ovld vload3(size_t offset, const __private short *p);
ushort3 __ovld vload3(size_t offset, const __private ushort *p);
int3 __ovld vload3(size_t offset, const __private int *p);
uint3 __ovld vload3(size_t offset, const __private uint *p);
long3 __ovld vload3(size_t offset, const __private long *p);
ulong3 __ovld vload3(size_t offset, const __private ulong *p);
float3 __ovld vload3(size_t offset, const __private float *p);
char4 __ovld vload4(size_t offset, const __private char *p);
uchar4 __ovld vload4(size_t offset, const __private uchar *p);
short4 __ovld vload4(size_t offset, const __private short *p);
ushort4 __ovld vload4(size_t offset, const __private ushort *p);
int4 __ovld vload4(size_t offset, const __private int *p);
uint4 __ovld vload4(size_t offset, const __private uint *p);
long4 __ovld vload4(size_t offset, const __private long *p);
ulong4 __ovld vload4(size_t offset, const __private ulong *p);
float4 __ovld vload4(size_t offset, const __private float *p);
char8 __ovld vload8(size_t offset, const __private char *p);
uchar8 __ovld vload8(size_t offset, const __private uchar *p);
short8 __ovld vload8(size_t offset, const __private short *p);
ushort8 __ovld vload8(size_t offset, const __private ushort *p);
int8 __ovld vload8(size_t offset, const __private int *p);
uint8 __ovld vload8(size_t offset, const __private uint *p);
long8 __ovld vload8(size_t offset, const __private long *p);
ulong8 __ovld vload8(size_t offset, const __private ulong *p);
float8 __ovld vload8(size_t offset, const __private float *p);
char16 __ovld vload16(size_t offset, const __private char *p);
uchar16 __ovld vload16(size_t offset, const __private uchar *p);
short16 __ovld vload16(size_t offset, const __private short *p);
ushort16 __ovld vload16(size_t offset, const __private ushort *p);
int16 __ovld vload16(size_t offset, const __private int *p);
uint16 __ovld vload16(size_t offset, const __private uint *p);
long16 __ovld vload16(size_t offset, const __private long *p);
ulong16 __ovld vload16(size_t offset, const __private ulong *p);
float16 __ovld vload16(size_t offset, const __private float *p);
#ifdef cl_khr_fp64
double2 __ovld vload2(size_t offset, const __global double *p);
double3 __ovld vload3(size_t offset, const __global double *p);
double4 __ovld vload4(size_t offset, const __global double *p);
double8 __ovld vload8(size_t offset, const __global double *p);
double16 __ovld vload16(size_t offset, const __global double *p);
double2 __ovld vload2(size_t offset, const __local double *p);
double3 __ovld vload3(size_t offset, const __local double *p);
double4 __ovld vload4(size_t offset, const __local double *p);
double8 __ovld vload8(size_t offset, const __local double *p);
double16 __ovld vload16(size_t offset, const __local double *p);
double2 __ovld vload2(size_t offset, const __private double *p);
double3 __ovld vload3(size_t offset, const __private double *p);
double4 __ovld vload4(size_t offset, const __private double *p);
double8 __ovld vload8(size_t offset, const __private double *p);
double16 __ovld vload16(size_t offset, const __private double *p);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld vload(size_t offset, const __global half *p);
half2 __ovld vload2(size_t offset, const __global half *p);
half3 __ovld vload3(size_t offset, const __global half *p);
half4 __ovld vload4(size_t offset, const __global half *p);
half8 __ovld vload8(size_t offset, const __global half *p);
half16 __ovld vload16(size_t offset, const __global half *p);
half __ovld vload(size_t offset, const __local half *p);
half2 __ovld vload2(size_t offset, const __local half *p);
half3 __ovld vload3(size_t offset, const __local half *p);
half4 __ovld vload4(size_t offset, const __local half *p);
half8 __ovld vload8(size_t offset, const __local half *p);
half16 __ovld vload16(size_t offset, const __local half *p);
half __ovld vload(size_t offset, const __private half *p);
half2 __ovld vload2(size_t offset, const __private half *p);
half3 __ovld vload3(size_t offset, const __private half *p);
half4 __ovld vload4(size_t offset, const __private half *p);
half8 __ovld vload8(size_t offset, const __private half *p);
half16 __ovld vload16(size_t offset, const __private half *p);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld vstore2(char2 data, size_t offset, char *p);
void __ovld vstore2(uchar2 data, size_t offset, uchar *p);
void __ovld vstore2(short2 data, size_t offset, short *p);
void __ovld vstore2(ushort2 data, size_t offset, ushort *p);
void __ovld vstore2(int2 data, size_t offset, int *p);
void __ovld vstore2(uint2 data, size_t offset, uint *p);
void __ovld vstore2(long2 data, size_t offset, long *p);
void __ovld vstore2(ulong2 data, size_t offset, ulong *p);
void __ovld vstore2(float2 data, size_t offset, float *p);
void __ovld vstore3(char3 data, size_t offset, char *p);
void __ovld vstore3(uchar3 data, size_t offset, uchar *p);
void __ovld vstore3(short3 data, size_t offset, short *p);
void __ovld vstore3(ushort3 data, size_t offset, ushort *p);
void __ovld vstore3(int3 data, size_t offset, int *p);
void __ovld vstore3(uint3 data, size_t offset, uint *p);
void __ovld vstore3(long3 data, size_t offset, long *p);
void __ovld vstore3(ulong3 data, size_t offset, ulong *p);
void __ovld vstore3(float3 data, size_t offset, float *p);
void __ovld vstore4(char4 data, size_t offset, char *p);
void __ovld vstore4(uchar4 data, size_t offset, uchar *p);
void __ovld vstore4(short4 data, size_t offset, short *p);
void __ovld vstore4(ushort4 data, size_t offset, ushort *p);
void __ovld vstore4(int4 data, size_t offset, int *p);
void __ovld vstore4(uint4 data, size_t offset, uint *p);
void __ovld vstore4(long4 data, size_t offset, long *p);
void __ovld vstore4(ulong4 data, size_t offset, ulong *p);
void __ovld vstore4(float4 data, size_t offset, float *p);
void __ovld vstore8(char8 data, size_t offset, char *p);
void __ovld vstore8(uchar8 data, size_t offset, uchar *p);
void __ovld vstore8(short8 data, size_t offset, short *p);
void __ovld vstore8(ushort8 data, size_t offset, ushort *p);
void __ovld vstore8(int8 data, size_t offset, int *p);
void __ovld vstore8(uint8 data, size_t offset, uint *p);
void __ovld vstore8(long8 data, size_t offset, long *p);
void __ovld vstore8(ulong8 data, size_t offset, ulong *p);
void __ovld vstore8(float8 data, size_t offset, float *p);
void __ovld vstore16(char16 data, size_t offset, char *p);
void __ovld vstore16(uchar16 data, size_t offset, uchar *p);
void __ovld vstore16(short16 data, size_t offset, short *p);
void __ovld vstore16(ushort16 data, size_t offset, ushort *p);
void __ovld vstore16(int16 data, size_t offset, int *p);
void __ovld vstore16(uint16 data, size_t offset, uint *p);
void __ovld vstore16(long16 data, size_t offset, long *p);
void __ovld vstore16(ulong16 data, size_t offset, ulong *p);
void __ovld vstore16(float16 data, size_t offset, float *p);
#ifdef cl_khr_fp64
void __ovld vstore2(double2 data, size_t offset, double *p);
void __ovld vstore3(double3 data, size_t offset, double *p);
void __ovld vstore4(double4 data, size_t offset, double *p);
void __ovld vstore8(double8 data, size_t offset, double *p);
void __ovld vstore16(double16 data, size_t offset, double *p);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
void __ovld vstore(half data, size_t offset, half *p);
void __ovld vstore2(half2 data, size_t offset, half *p);
void __ovld vstore3(half3 data, size_t offset, half *p);
void __ovld vstore4(half4 data, size_t offset, half *p);
void __ovld vstore8(half8 data, size_t offset, half *p);
void __ovld vstore16(half16 data, size_t offset, half *p);
#endif //cl_khr_fp16
#else
void __ovld vstore2(char2 data, size_t offset, __global char *p);
void __ovld vstore2(uchar2 data, size_t offset, __global uchar *p);
void __ovld vstore2(short2 data, size_t offset, __global short *p);
void __ovld vstore2(ushort2 data, size_t offset, __global ushort *p);
void __ovld vstore2(int2 data, size_t offset, __global int *p);
void __ovld vstore2(uint2 data, size_t offset, __global uint *p);
void __ovld vstore2(long2 data, size_t offset, __global long *p);
void __ovld vstore2(ulong2 data, size_t offset, __global ulong *p);
void __ovld vstore2(float2 data, size_t offset, __global float *p);
void __ovld vstore3(char3 data, size_t offset, __global char *p);
void __ovld vstore3(uchar3 data, size_t offset, __global uchar *p);
void __ovld vstore3(short3 data, size_t offset, __global short *p);
void __ovld vstore3(ushort3 data, size_t offset, __global ushort *p);
void __ovld vstore3(int3 data, size_t offset, __global int *p);
void __ovld vstore3(uint3 data, size_t offset, __global uint *p);
void __ovld vstore3(long3 data, size_t offset, __global long *p);
void __ovld vstore3(ulong3 data, size_t offset, __global ulong *p);
void __ovld vstore3(float3 data, size_t offset, __global float *p);
void __ovld vstore4(char4 data, size_t offset, __global char *p);
void __ovld vstore4(uchar4 data, size_t offset, __global uchar *p);
void __ovld vstore4(short4 data, size_t offset, __global short *p);
void __ovld vstore4(ushort4 data, size_t offset, __global ushort *p);
void __ovld vstore4(int4 data, size_t offset, __global int *p);
void __ovld vstore4(uint4 data, size_t offset, __global uint *p);
void __ovld vstore4(long4 data, size_t offset, __global long *p);
void __ovld vstore4(ulong4 data, size_t offset, __global ulong *p);
void __ovld vstore4(float4 data, size_t offset, __global float *p);
void __ovld vstore8(char8 data, size_t offset, __global char *p);
void __ovld vstore8(uchar8 data, size_t offset, __global uchar *p);
void __ovld vstore8(short8 data, size_t offset, __global short *p);
void __ovld vstore8(ushort8 data, size_t offset, __global ushort *p);
void __ovld vstore8(int8 data, size_t offset, __global int *p);
void __ovld vstore8(uint8 data, size_t offset, __global uint *p);
void __ovld vstore8(long8 data, size_t offset, __global long *p);
void __ovld vstore8(ulong8 data, size_t offset, __global ulong *p);
void __ovld vstore8(float8 data, size_t offset, __global float *p);
void __ovld vstore16(char16 data, size_t offset, __global char *p);
void __ovld vstore16(uchar16 data, size_t offset, __global uchar *p);
void __ovld vstore16(short16 data, size_t offset, __global short *p);
void __ovld vstore16(ushort16 data, size_t offset, __global ushort *p);
void __ovld vstore16(int16 data, size_t offset, __global int *p);
void __ovld vstore16(uint16 data, size_t offset, __global uint *p);
void __ovld vstore16(long16 data, size_t offset, __global long *p);
void __ovld vstore16(ulong16 data, size_t offset, __global ulong *p);
void __ovld vstore16(float16 data, size_t offset, __global float *p);
void __ovld vstore2(char2 data, size_t offset, __local char *p);
void __ovld vstore2(uchar2 data, size_t offset, __local uchar *p);
void __ovld vstore2(short2 data, size_t offset, __local short *p);
void __ovld vstore2(ushort2 data, size_t offset, __local ushort *p);
void __ovld vstore2(int2 data, size_t offset, __local int *p);
void __ovld vstore2(uint2 data, size_t offset, __local uint *p);
void __ovld vstore2(long2 data, size_t offset, __local long *p);
void __ovld vstore2(ulong2 data, size_t offset, __local ulong *p);
void __ovld vstore2(float2 data, size_t offset, __local float *p);
void __ovld vstore3(char3 data, size_t offset, __local char *p);
void __ovld vstore3(uchar3 data, size_t offset, __local uchar *p);
void __ovld vstore3(short3 data, size_t offset, __local short *p);
void __ovld vstore3(ushort3 data, size_t offset, __local ushort *p);
void __ovld vstore3(int3 data, size_t offset, __local int *p);
void __ovld vstore3(uint3 data, size_t offset, __local uint *p);
void __ovld vstore3(long3 data, size_t offset, __local long *p);
void __ovld vstore3(ulong3 data, size_t offset, __local ulong *p);
void __ovld vstore3(float3 data, size_t offset, __local float *p);
void __ovld vstore4(char4 data, size_t offset, __local char *p);
void __ovld vstore4(uchar4 data, size_t offset, __local uchar *p);
void __ovld vstore4(short4 data, size_t offset, __local short *p);
void __ovld vstore4(ushort4 data, size_t offset, __local ushort *p);
void __ovld vstore4(int4 data, size_t offset, __local int *p);
void __ovld vstore4(uint4 data, size_t offset, __local uint *p);
void __ovld vstore4(long4 data, size_t offset, __local long *p);
void __ovld vstore4(ulong4 data, size_t offset, __local ulong *p);
void __ovld vstore4(float4 data, size_t offset, __local float *p);
void __ovld vstore8(char8 data, size_t offset, __local char *p);
void __ovld vstore8(uchar8 data, size_t offset, __local uchar *p);
void __ovld vstore8(short8 data, size_t offset, __local short *p);
void __ovld vstore8(ushort8 data, size_t offset, __local ushort *p);
void __ovld vstore8(int8 data, size_t offset, __local int *p);
void __ovld vstore8(uint8 data, size_t offset, __local uint *p);
void __ovld vstore8(long8 data, size_t offset, __local long *p);
void __ovld vstore8(ulong8 data, size_t offset, __local ulong *p);
void __ovld vstore8(float8 data, size_t offset, __local float *p);
void __ovld vstore16(char16 data, size_t offset, __local char *p);
void __ovld vstore16(uchar16 data, size_t offset, __local uchar *p);
void __ovld vstore16(short16 data, size_t offset, __local short *p);
void __ovld vstore16(ushort16 data, size_t offset, __local ushort *p);
void __ovld vstore16(int16 data, size_t offset, __local int *p);
void __ovld vstore16(uint16 data, size_t offset, __local uint *p);
void __ovld vstore16(long16 data, size_t offset, __local long *p);
void __ovld vstore16(ulong16 data, size_t offset, __local ulong *p);
void __ovld vstore16(float16 data, size_t offset, __local float *p);
void __ovld vstore2(char2 data, size_t offset, __private char *p);
void __ovld vstore2(uchar2 data, size_t offset, __private uchar *p);
void __ovld vstore2(short2 data, size_t offset, __private short *p);
void __ovld vstore2(ushort2 data, size_t offset, __private ushort *p);
void __ovld vstore2(int2 data, size_t offset, __private int *p);
void __ovld vstore2(uint2 data, size_t offset, __private uint *p);
void __ovld vstore2(long2 data, size_t offset, __private long *p);
void __ovld vstore2(ulong2 data, size_t offset, __private ulong *p);
void __ovld vstore2(float2 data, size_t offset, __private float *p);
void __ovld vstore3(char3 data, size_t offset, __private char *p);
void __ovld vstore3(uchar3 data, size_t offset, __private uchar *p);
void __ovld vstore3(short3 data, size_t offset, __private short *p);
void __ovld vstore3(ushort3 data, size_t offset, __private ushort *p);
void __ovld vstore3(int3 data, size_t offset, __private int *p);
void __ovld vstore3(uint3 data, size_t offset, __private uint *p);
void __ovld vstore3(long3 data, size_t offset, __private long *p);
void __ovld vstore3(ulong3 data, size_t offset, __private ulong *p);
void __ovld vstore3(float3 data, size_t offset, __private float *p);
void __ovld vstore4(char4 data, size_t offset, __private char *p);
void __ovld vstore4(uchar4 data, size_t offset, __private uchar *p);
void __ovld vstore4(short4 data, size_t offset, __private short *p);
void __ovld vstore4(ushort4 data, size_t offset, __private ushort *p);
void __ovld vstore4(int4 data, size_t offset, __private int *p);
void __ovld vstore4(uint4 data, size_t offset, __private uint *p);
void __ovld vstore4(long4 data, size_t offset, __private long *p);
void __ovld vstore4(ulong4 data, size_t offset, __private ulong *p);
void __ovld vstore4(float4 data, size_t offset, __private float *p);
void __ovld vstore8(char8 data, size_t offset, __private char *p);
void __ovld vstore8(uchar8 data, size_t offset, __private uchar *p);
void __ovld vstore8(short8 data, size_t offset, __private short *p);
void __ovld vstore8(ushort8 data, size_t offset, __private ushort *p);
void __ovld vstore8(int8 data, size_t offset, __private int *p);
void __ovld vstore8(uint8 data, size_t offset, __private uint *p);
void __ovld vstore8(long8 data, size_t offset, __private long *p);
void __ovld vstore8(ulong8 data, size_t offset, __private ulong *p);
void __ovld vstore8(float8 data, size_t offset, __private float *p);
void __ovld vstore16(char16 data, size_t offset, __private char *p);
void __ovld vstore16(uchar16 data, size_t offset, __private uchar *p);
void __ovld vstore16(short16 data, size_t offset, __private short *p);
void __ovld vstore16(ushort16 data, size_t offset, __private ushort *p);
void __ovld vstore16(int16 data, size_t offset, __private int *p);
void __ovld vstore16(uint16 data, size_t offset, __private uint *p);
void __ovld vstore16(long16 data, size_t offset, __private long *p);
void __ovld vstore16(ulong16 data, size_t offset, __private ulong *p);
void __ovld vstore16(float16 data, size_t offset, __private float *p);
#ifdef cl_khr_fp64
void __ovld vstore2(double2 data, size_t offset, __global double *p);
void __ovld vstore3(double3 data, size_t offset, __global double *p);
void __ovld vstore4(double4 data, size_t offset, __global double *p);
void __ovld vstore8(double8 data, size_t offset, __global double *p);
void __ovld vstore16(double16 data, size_t offset, __global double *p);
void __ovld vstore2(double2 data, size_t offset, __local double *p);
void __ovld vstore3(double3 data, size_t offset, __local double *p);
void __ovld vstore4(double4 data, size_t offset, __local double *p);
void __ovld vstore8(double8 data, size_t offset, __local double *p);
void __ovld vstore16(double16 data, size_t offset, __local double *p);
void __ovld vstore2(double2 data, size_t offset, __private double *p);
void __ovld vstore3(double3 data, size_t offset, __private double *p);
void __ovld vstore4(double4 data, size_t offset, __private double *p);
void __ovld vstore8(double8 data, size_t offset, __private double *p);
void __ovld vstore16(double16 data, size_t offset, __private double *p);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
void __ovld vstore(half data, size_t offset, __global half *p);
void __ovld vstore2(half2 data, size_t offset, __global half *p);
void __ovld vstore3(half3 data, size_t offset, __global half *p);
void __ovld vstore4(half4 data, size_t offset, __global half *p);
void __ovld vstore8(half8 data, size_t offset, __global half *p);
void __ovld vstore16(half16 data, size_t offset, __global half *p);
void __ovld vstore(half data, size_t offset, __local half *p);
void __ovld vstore2(half2 data, size_t offset, __local half *p);
void __ovld vstore3(half3 data, size_t offset, __local half *p);
void __ovld vstore4(half4 data, size_t offset, __local half *p);
void __ovld vstore8(half8 data, size_t offset, __local half *p);
void __ovld vstore16(half16 data, size_t offset, __local half *p);
void __ovld vstore(half data, size_t offset, __private half *p);
void __ovld vstore2(half2 data, size_t offset, __private half *p);
void __ovld vstore3(half3 data, size_t offset, __private half *p);
void __ovld vstore4(half4 data, size_t offset, __private half *p);
void __ovld vstore8(half8 data, size_t offset, __private half *p);
void __ovld vstore16(half16 data, size_t offset, __private half *p);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
/**
* Read sizeof (half) bytes of data from address
* (p + offset). The data read is interpreted as a
* half value. The half value is converted to a
* float value and the float value is returned.
* The read address computed as (p + offset)
* must be 16-bit aligned.
*/
float __ovld vload_half(size_t offset, const __constant half *p);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float __ovld vload_half(size_t offset, const half *p);
#else
float __ovld vload_half(size_t offset, const __global half *p);
float __ovld vload_half(size_t offset, const __local half *p);
float __ovld vload_half(size_t offset, const __private half *p);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
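/*
 * Illustrative sketch (not part of the header): one plausible way to use
 * vload_half from a kernel, assuming a tightly packed __global half buffer.
 * The kernel and argument names below are hypothetical.
 *
 *   __kernel void half_to_float(__global const half *in, __global float *out) {
 *     size_t i = get_global_id(0);
 *     // Reads the half at (in + i) and returns it widened to float.
 *     out[i] = vload_half(i, in);
 *   }
 */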
/**
* Read sizeof (halfn) bytes of data from address
* (p + (offset * n)). The data read is interpreted
* as a halfn value. The halfn value read is
* converted to a floatn value and the floatn
* value is returned. The read address computed
* as (p + (offset * n)) must be 16-bit aligned.
*/
float2 __ovld vload_half2(size_t offset, const __constant half *p);
float3 __ovld vload_half3(size_t offset, const __constant half *p);
float4 __ovld vload_half4(size_t offset, const __constant half *p);
float8 __ovld vload_half8(size_t offset, const __constant half *p);
float16 __ovld vload_half16(size_t offset, const __constant half *p);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float2 __ovld vload_half2(size_t offset, const half *p);
float3 __ovld vload_half3(size_t offset, const half *p);
float4 __ovld vload_half4(size_t offset, const half *p);
float8 __ovld vload_half8(size_t offset, const half *p);
float16 __ovld vload_half16(size_t offset, const half *p);
#else
float2 __ovld vload_half2(size_t offset, const __global half *p);
float3 __ovld vload_half3(size_t offset, const __global half *p);
float4 __ovld vload_half4(size_t offset, const __global half *p);
float8 __ovld vload_half8(size_t offset, const __global half *p);
float16 __ovld vload_half16(size_t offset, const __global half *p);
float2 __ovld vload_half2(size_t offset, const __local half *p);
float3 __ovld vload_half3(size_t offset, const __local half *p);
float4 __ovld vload_half4(size_t offset, const __local half *p);
float8 __ovld vload_half8(size_t offset, const __local half *p);
float16 __ovld vload_half16(size_t offset, const __local half *p);
float2 __ovld vload_half2(size_t offset, const __private half *p);
float3 __ovld vload_half3(size_t offset, const __private half *p);
float4 __ovld vload_half4(size_t offset, const __private half *p);
float8 __ovld vload_half8(size_t offset, const __private half *p);
float16 __ovld vload_half16(size_t offset, const __private half *p);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
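/*
 * Illustrative sketch (not part of the header): vload_halfn reads n
 * consecutive half values starting at (p + (offset * n)) and returns them as
 * a floatn. Names below are hypothetical.
 *
 *   __kernel void sum_half4(__global const half *in, __global float *out) {
 *     size_t i = get_global_id(0);
 *     float4 v = vload_half4(i, in);   // reads 4 halfs at (in + i * 4)
 *     out[i] = v.x + v.y + v.z + v.w;
 *   }
 */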
/**
* The float value given by data is first
* converted to a half value using the appropriate
 * rounding mode. The half value is then written
 * to the address computed as (p + offset). The
 * address computed as (p + offset) must be
 * 16-bit aligned.
 * vstore_half uses the current rounding mode.
* The default current rounding mode is round to
* nearest even.
*/
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld vstore_half(float data, size_t offset, half *p);
void __ovld vstore_half_rte(float data, size_t offset, half *p);
void __ovld vstore_half_rtz(float data, size_t offset, half *p);
void __ovld vstore_half_rtp(float data, size_t offset, half *p);
void __ovld vstore_half_rtn(float data, size_t offset, half *p);
#ifdef cl_khr_fp64
void __ovld vstore_half(double data, size_t offset, half *p);
void __ovld vstore_half_rte(double data, size_t offset, half *p);
void __ovld vstore_half_rtz(double data, size_t offset, half *p);
void __ovld vstore_half_rtp(double data, size_t offset, half *p);
void __ovld vstore_half_rtn(double data, size_t offset, half *p);
#endif //cl_khr_fp64
#else
void __ovld vstore_half(float data, size_t offset, __global half *p);
void __ovld vstore_half_rte(float data, size_t offset, __global half *p);
void __ovld vstore_half_rtz(float data, size_t offset, __global half *p);
void __ovld vstore_half_rtp(float data, size_t offset, __global half *p);
void __ovld vstore_half_rtn(float data, size_t offset, __global half *p);
void __ovld vstore_half(float data, size_t offset, __local half *p);
void __ovld vstore_half_rte(float data, size_t offset, __local half *p);
void __ovld vstore_half_rtz(float data, size_t offset, __local half *p);
void __ovld vstore_half_rtp(float data, size_t offset, __local half *p);
void __ovld vstore_half_rtn(float data, size_t offset, __local half *p);
void __ovld vstore_half(float data, size_t offset, __private half *p);
void __ovld vstore_half_rte(float data, size_t offset, __private half *p);
void __ovld vstore_half_rtz(float data, size_t offset, __private half *p);
void __ovld vstore_half_rtp(float data, size_t offset, __private half *p);
void __ovld vstore_half_rtn(float data, size_t offset, __private half *p);
#ifdef cl_khr_fp64
void __ovld vstore_half(double data, size_t offset, __global half *p);
void __ovld vstore_half_rte(double data, size_t offset, __global half *p);
void __ovld vstore_half_rtz(double data, size_t offset, __global half *p);
void __ovld vstore_half_rtp(double data, size_t offset, __global half *p);
void __ovld vstore_half_rtn(double data, size_t offset, __global half *p);
void __ovld vstore_half(double data, size_t offset, __local half *p);
void __ovld vstore_half_rte(double data, size_t offset, __local half *p);
void __ovld vstore_half_rtz(double data, size_t offset, __local half *p);
void __ovld vstore_half_rtp(double data, size_t offset, __local half *p);
void __ovld vstore_half_rtn(double data, size_t offset, __local half *p);
void __ovld vstore_half(double data, size_t offset, __private half *p);
void __ovld vstore_half_rte(double data, size_t offset, __private half *p);
void __ovld vstore_half_rtz(double data, size_t offset, __private half *p);
void __ovld vstore_half_rtp(double data, size_t offset, __private half *p);
void __ovld vstore_half_rtn(double data, size_t offset, __private half *p);
#endif //cl_khr_fp64
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
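/*
 * Illustrative sketch (not part of the header): vstore_half narrows a float
 * to half before storing it; the _rte/_rtz/_rtp/_rtn suffixes select the
 * rounding mode explicitly, while the unsuffixed form uses the current
 * rounding mode (round to nearest even by default). Names below are
 * hypothetical.
 *
 *   __kernel void float_to_half(__global const float *in, __global half *out) {
 *     size_t i = get_global_id(0);
 *     vstore_half(in[i], i, out);        // default rounding mode
 *     // vstore_half_rtz(in[i], i, out); // explicit round-toward-zero variant
 *   }
 */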
/**
* The floatn value given by data is converted to
* a halfn value using the appropriate rounding
* mode. The halfn value is then written to
* address computed as (p + (offset * n)). The
* address computed as (p + (offset * n)) must be
* 16-bit aligned.
* vstore_halfn uses the current rounding mode.
* The default current rounding mode is round to
* nearest even.
*/
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld vstore_half2(float2 data, size_t offset, half *p);
void __ovld vstore_half3(float3 data, size_t offset, half *p);
void __ovld vstore_half4(float4 data, size_t offset, half *p);
void __ovld vstore_half8(float8 data, size_t offset, half *p);
void __ovld vstore_half16(float16 data, size_t offset, half *p);
void __ovld vstore_half2_rte(float2 data, size_t offset, half *p);
void __ovld vstore_half3_rte(float3 data, size_t offset, half *p);
void __ovld vstore_half4_rte(float4 data, size_t offset, half *p);
void __ovld vstore_half8_rte(float8 data, size_t offset, half *p);
void __ovld vstore_half16_rte(float16 data, size_t offset, half *p);
void __ovld vstore_half2_rtz(float2 data, size_t offset, half *p);
void __ovld vstore_half3_rtz(float3 data, size_t offset, half *p);
void __ovld vstore_half4_rtz(float4 data, size_t offset, half *p);
void __ovld vstore_half8_rtz(float8 data, size_t offset, half *p);
void __ovld vstore_half16_rtz(float16 data, size_t offset, half *p);
void __ovld vstore_half2_rtp(float2 data, size_t offset, half *p);
void __ovld vstore_half3_rtp(float3 data, size_t offset, half *p);
void __ovld vstore_half4_rtp(float4 data, size_t offset, half *p);
void __ovld vstore_half8_rtp(float8 data, size_t offset, half *p);
void __ovld vstore_half16_rtp(float16 data, size_t offset, half *p);
void __ovld vstore_half2_rtn(float2 data, size_t offset, half *p);
void __ovld vstore_half3_rtn(float3 data, size_t offset, half *p);
void __ovld vstore_half4_rtn(float4 data, size_t offset, half *p);
void __ovld vstore_half8_rtn(float8 data, size_t offset, half *p);
void __ovld vstore_half16_rtn(float16 data, size_t offset, half *p);
#ifdef cl_khr_fp64
void __ovld vstore_half2(double2 data, size_t offset, half *p);
void __ovld vstore_half3(double3 data, size_t offset, half *p);
void __ovld vstore_half4(double4 data, size_t offset, half *p);
void __ovld vstore_half8(double8 data, size_t offset, half *p);
void __ovld vstore_half16(double16 data, size_t offset, half *p);
void __ovld vstore_half2_rte(double2 data, size_t offset, half *p);
void __ovld vstore_half3_rte(double3 data, size_t offset, half *p);
void __ovld vstore_half4_rte(double4 data, size_t offset, half *p);
void __ovld vstore_half8_rte(double8 data, size_t offset, half *p);
void __ovld vstore_half16_rte(double16 data, size_t offset, half *p);
void __ovld vstore_half2_rtz(double2 data, size_t offset, half *p);
void __ovld vstore_half3_rtz(double3 data, size_t offset, half *p);
void __ovld vstore_half4_rtz(double4 data, size_t offset, half *p);
void __ovld vstore_half8_rtz(double8 data, size_t offset, half *p);
void __ovld vstore_half16_rtz(double16 data, size_t offset, half *p);
void __ovld vstore_half2_rtp(double2 data, size_t offset, half *p);
void __ovld vstore_half3_rtp(double3 data, size_t offset, half *p);
void __ovld vstore_half4_rtp(double4 data, size_t offset, half *p);
void __ovld vstore_half8_rtp(double8 data, size_t offset, half *p);
void __ovld vstore_half16_rtp(double16 data, size_t offset, half *p);
void __ovld vstore_half2_rtn(double2 data, size_t offset, half *p);
void __ovld vstore_half3_rtn(double3 data, size_t offset, half *p);
void __ovld vstore_half4_rtn(double4 data, size_t offset, half *p);
void __ovld vstore_half8_rtn(double8 data, size_t offset, half *p);
void __ovld vstore_half16_rtn(double16 data, size_t offset, half *p);
#endif //cl_khr_fp64
#else
void __ovld vstore_half2(float2 data, size_t offset, __global half *p);
void __ovld vstore_half3(float3 data, size_t offset, __global half *p);
void __ovld vstore_half4(float4 data, size_t offset, __global half *p);
void __ovld vstore_half8(float8 data, size_t offset, __global half *p);
void __ovld vstore_half16(float16 data, size_t offset, __global half *p);
void __ovld vstore_half2_rte(float2 data, size_t offset, __global half *p);
void __ovld vstore_half3_rte(float3 data, size_t offset, __global half *p);
void __ovld vstore_half4_rte(float4 data, size_t offset, __global half *p);
void __ovld vstore_half8_rte(float8 data, size_t offset, __global half *p);
void __ovld vstore_half16_rte(float16 data, size_t offset, __global half *p);
void __ovld vstore_half2_rtz(float2 data, size_t offset, __global half *p);
void __ovld vstore_half3_rtz(float3 data, size_t offset, __global half *p);
void __ovld vstore_half4_rtz(float4 data, size_t offset, __global half *p);
void __ovld vstore_half8_rtz(float8 data, size_t offset, __global half *p);
void __ovld vstore_half16_rtz(float16 data, size_t offset, __global half *p);
void __ovld vstore_half2_rtp(float2 data, size_t offset, __global half *p);
void __ovld vstore_half3_rtp(float3 data, size_t offset, __global half *p);
void __ovld vstore_half4_rtp(float4 data, size_t offset, __global half *p);
void __ovld vstore_half8_rtp(float8 data, size_t offset, __global half *p);
void __ovld vstore_half16_rtp(float16 data, size_t offset, __global half *p);
void __ovld vstore_half2_rtn(float2 data, size_t offset, __global half *p);
void __ovld vstore_half3_rtn(float3 data, size_t offset, __global half *p);
void __ovld vstore_half4_rtn(float4 data, size_t offset, __global half *p);
void __ovld vstore_half8_rtn(float8 data, size_t offset, __global half *p);
void __ovld vstore_half16_rtn(float16 data, size_t offset, __global half *p);
void __ovld vstore_half2(float2 data, size_t offset, __local half *p);
void __ovld vstore_half3(float3 data, size_t offset, __local half *p);
void __ovld vstore_half4(float4 data, size_t offset, __local half *p);
void __ovld vstore_half8(float8 data, size_t offset, __local half *p);
void __ovld vstore_half16(float16 data, size_t offset, __local half *p);
void __ovld vstore_half2_rte(float2 data, size_t offset, __local half *p);
void __ovld vstore_half3_rte(float3 data, size_t offset, __local half *p);
void __ovld vstore_half4_rte(float4 data, size_t offset, __local half *p);
void __ovld vstore_half8_rte(float8 data, size_t offset, __local half *p);
void __ovld vstore_half16_rte(float16 data, size_t offset, __local half *p);
void __ovld vstore_half2_rtz(float2 data, size_t offset, __local half *p);
void __ovld vstore_half3_rtz(float3 data, size_t offset, __local half *p);
void __ovld vstore_half4_rtz(float4 data, size_t offset, __local half *p);
void __ovld vstore_half8_rtz(float8 data, size_t offset, __local half *p);
void __ovld vstore_half16_rtz(float16 data, size_t offset, __local half *p);
void __ovld vstore_half2_rtp(float2 data, size_t offset, __local half *p);
void __ovld vstore_half3_rtp(float3 data, size_t offset, __local half *p);
void __ovld vstore_half4_rtp(float4 data, size_t offset, __local half *p);
void __ovld vstore_half8_rtp(float8 data, size_t offset, __local half *p);
void __ovld vstore_half16_rtp(float16 data, size_t offset, __local half *p);
void __ovld vstore_half2_rtn(float2 data, size_t offset, __local half *p);
void __ovld vstore_half3_rtn(float3 data, size_t offset, __local half *p);
void __ovld vstore_half4_rtn(float4 data, size_t offset, __local half *p);
void __ovld vstore_half8_rtn(float8 data, size_t offset, __local half *p);
void __ovld vstore_half16_rtn(float16 data, size_t offset, __local half *p);
void __ovld vstore_half2(float2 data, size_t offset, __private half *p);
void __ovld vstore_half3(float3 data, size_t offset, __private half *p);
void __ovld vstore_half4(float4 data, size_t offset, __private half *p);
void __ovld vstore_half8(float8 data, size_t offset, __private half *p);
void __ovld vstore_half16(float16 data, size_t offset, __private half *p);
void __ovld vstore_half2_rte(float2 data, size_t offset, __private half *p);
void __ovld vstore_half3_rte(float3 data, size_t offset, __private half *p);
void __ovld vstore_half4_rte(float4 data, size_t offset, __private half *p);
void __ovld vstore_half8_rte(float8 data, size_t offset, __private half *p);
void __ovld vstore_half16_rte(float16 data, size_t offset, __private half *p);
void __ovld vstore_half2_rtz(float2 data, size_t offset, __private half *p);
void __ovld vstore_half3_rtz(float3 data, size_t offset, __private half *p);
void __ovld vstore_half4_rtz(float4 data, size_t offset, __private half *p);
void __ovld vstore_half8_rtz(float8 data, size_t offset, __private half *p);
void __ovld vstore_half16_rtz(float16 data, size_t offset, __private half *p);
void __ovld vstore_half2_rtp(float2 data, size_t offset, __private half *p);
void __ovld vstore_half3_rtp(float3 data, size_t offset, __private half *p);
void __ovld vstore_half4_rtp(float4 data, size_t offset, __private half *p);
void __ovld vstore_half8_rtp(float8 data, size_t offset, __private half *p);
void __ovld vstore_half16_rtp(float16 data, size_t offset, __private half *p);
void __ovld vstore_half2_rtn(float2 data, size_t offset, __private half *p);
void __ovld vstore_half3_rtn(float3 data, size_t offset, __private half *p);
void __ovld vstore_half4_rtn(float4 data, size_t offset, __private half *p);
void __ovld vstore_half8_rtn(float8 data, size_t offset, __private half *p);
void __ovld vstore_half16_rtn(float16 data, size_t offset, __private half *p);
#ifdef cl_khr_fp64
void __ovld vstore_half2(double2 data, size_t offset, __global half *p);
void __ovld vstore_half3(double3 data, size_t offset, __global half *p);
void __ovld vstore_half4(double4 data, size_t offset, __global half *p);
void __ovld vstore_half8(double8 data, size_t offset, __global half *p);
void __ovld vstore_half16(double16 data, size_t offset, __global half *p);
void __ovld vstore_half2_rte(double2 data, size_t offset, __global half *p);
void __ovld vstore_half3_rte(double3 data, size_t offset, __global half *p);
void __ovld vstore_half4_rte(double4 data, size_t offset, __global half *p);
void __ovld vstore_half8_rte(double8 data, size_t offset, __global half *p);
void __ovld vstore_half16_rte(double16 data, size_t offset, __global half *p);
void __ovld vstore_half2_rtz(double2 data, size_t offset, __global half *p);
void __ovld vstore_half3_rtz(double3 data, size_t offset, __global half *p);
void __ovld vstore_half4_rtz(double4 data, size_t offset, __global half *p);
void __ovld vstore_half8_rtz(double8 data, size_t offset, __global half *p);
void __ovld vstore_half16_rtz(double16 data, size_t offset, __global half *p);
void __ovld vstore_half2_rtp(double2 data, size_t offset, __global half *p);
void __ovld vstore_half3_rtp(double3 data, size_t offset, __global half *p);
void __ovld vstore_half4_rtp(double4 data, size_t offset, __global half *p);
void __ovld vstore_half8_rtp(double8 data, size_t offset, __global half *p);
void __ovld vstore_half16_rtp(double16 data, size_t offset, __global half *p);
void __ovld vstore_half2_rtn(double2 data, size_t offset, __global half *p);
void __ovld vstore_half3_rtn(double3 data, size_t offset, __global half *p);
void __ovld vstore_half4_rtn(double4 data, size_t offset, __global half *p);
void __ovld vstore_half8_rtn(double8 data, size_t offset, __global half *p);
void __ovld vstore_half16_rtn(double16 data, size_t offset, __global half *p);
void __ovld vstore_half2(double2 data, size_t offset, __local half *p);
void __ovld vstore_half3(double3 data, size_t offset, __local half *p);
void __ovld vstore_half4(double4 data, size_t offset, __local half *p);
void __ovld vstore_half8(double8 data, size_t offset, __local half *p);
void __ovld vstore_half16(double16 data, size_t offset, __local half *p);
void __ovld vstore_half2_rte(double2 data, size_t offset, __local half *p);
void __ovld vstore_half3_rte(double3 data, size_t offset, __local half *p);
void __ovld vstore_half4_rte(double4 data, size_t offset, __local half *p);
void __ovld vstore_half8_rte(double8 data, size_t offset, __local half *p);
void __ovld vstore_half16_rte(double16 data, size_t offset, __local half *p);
void __ovld vstore_half2_rtz(double2 data, size_t offset, __local half *p);
void __ovld vstore_half3_rtz(double3 data, size_t offset, __local half *p);
void __ovld vstore_half4_rtz(double4 data, size_t offset, __local half *p);
void __ovld vstore_half8_rtz(double8 data, size_t offset, __local half *p);
void __ovld vstore_half16_rtz(double16 data, size_t offset, __local half *p);
void __ovld vstore_half2_rtp(double2 data, size_t offset, __local half *p);
void __ovld vstore_half3_rtp(double3 data, size_t offset, __local half *p);
void __ovld vstore_half4_rtp(double4 data, size_t offset, __local half *p);
void __ovld vstore_half8_rtp(double8 data, size_t offset, __local half *p);
void __ovld vstore_half16_rtp(double16 data, size_t offset, __local half *p);
void __ovld vstore_half2_rtn(double2 data, size_t offset, __local half *p);
void __ovld vstore_half3_rtn(double3 data, size_t offset, __local half *p);
void __ovld vstore_half4_rtn(double4 data, size_t offset, __local half *p);
void __ovld vstore_half8_rtn(double8 data, size_t offset, __local half *p);
void __ovld vstore_half16_rtn(double16 data, size_t offset, __local half *p);
void __ovld vstore_half2(double2 data, size_t offset, __private half *p);
void __ovld vstore_half3(double3 data, size_t offset, __private half *p);
void __ovld vstore_half4(double4 data, size_t offset, __private half *p);
void __ovld vstore_half8(double8 data, size_t offset, __private half *p);
void __ovld vstore_half16(double16 data, size_t offset, __private half *p);
void __ovld vstore_half2_rte(double2 data, size_t offset, __private half *p);
void __ovld vstore_half3_rte(double3 data, size_t offset, __private half *p);
void __ovld vstore_half4_rte(double4 data, size_t offset, __private half *p);
void __ovld vstore_half8_rte(double8 data, size_t offset, __private half *p);
void __ovld vstore_half16_rte(double16 data, size_t offset, __private half *p);
void __ovld vstore_half2_rtz(double2 data, size_t offset, __private half *p);
void __ovld vstore_half3_rtz(double3 data, size_t offset, __private half *p);
void __ovld vstore_half4_rtz(double4 data, size_t offset, __private half *p);
void __ovld vstore_half8_rtz(double8 data, size_t offset, __private half *p);
void __ovld vstore_half16_rtz(double16 data, size_t offset, __private half *p);
void __ovld vstore_half2_rtp(double2 data, size_t offset, __private half *p);
void __ovld vstore_half3_rtp(double3 data, size_t offset, __private half *p);
void __ovld vstore_half4_rtp(double4 data, size_t offset, __private half *p);
void __ovld vstore_half8_rtp(double8 data, size_t offset, __private half *p);
void __ovld vstore_half16_rtp(double16 data, size_t offset, __private half *p);
void __ovld vstore_half2_rtn(double2 data, size_t offset, __private half *p);
void __ovld vstore_half3_rtn(double3 data, size_t offset, __private half *p);
void __ovld vstore_half4_rtn(double4 data, size_t offset, __private half *p);
void __ovld vstore_half8_rtn(double8 data, size_t offset, __private half *p);
void __ovld vstore_half16_rtn(double16 data, size_t offset, __private half *p);
#endif //cl_khr_fp64
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
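/*
 * Illustrative sketch (not part of the header): vstore_halfn converts a
 * floatn to n halfs and writes them starting at (p + (offset * n)). Names
 * below are hypothetical.
 *
 *   __kernel void pack_half4(__global const float4 *in, __global half *out) {
 *     size_t i = get_global_id(0);
 *     vstore_half4(in[i], i, out);   // writes 4 halfs at (out + i * 4)
 *   }
 */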
/**
* For n = 1, 2, 4, 8 and 16 read sizeof (halfn)
* bytes of data from address (p + (offset * n)).
* The data read is interpreted as a halfn value.
* The halfn value read is converted to a floatn
* value and the floatn value is returned.
* The address computed as (p + (offset * n))
* must be aligned to sizeof (halfn) bytes.
* For n = 3, vloada_half3 reads a half3 from
* address (p + (offset * 4)) and returns a float3.
* The address computed as (p + (offset * 4))
* must be aligned to sizeof (half) * 4 bytes.
*/
float __ovld vloada_half(size_t offset, const __constant half *p);
float2 __ovld vloada_half2(size_t offset, const __constant half *p);
float3 __ovld vloada_half3(size_t offset, const __constant half *p);
float4 __ovld vloada_half4(size_t offset, const __constant half *p);
float8 __ovld vloada_half8(size_t offset, const __constant half *p);
float16 __ovld vloada_half16(size_t offset, const __constant half *p);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float __ovld vloada_half(size_t offset, const half *p);
float2 __ovld vloada_half2(size_t offset, const half *p);
float3 __ovld vloada_half3(size_t offset, const half *p);
float4 __ovld vloada_half4(size_t offset, const half *p);
float8 __ovld vloada_half8(size_t offset, const half *p);
float16 __ovld vloada_half16(size_t offset, const half *p);
#else
float __ovld vloada_half(size_t offset, const __global half *p);
float2 __ovld vloada_half2(size_t offset, const __global half *p);
float3 __ovld vloada_half3(size_t offset, const __global half *p);
float4 __ovld vloada_half4(size_t offset, const __global half *p);
float8 __ovld vloada_half8(size_t offset, const __global half *p);
float16 __ovld vloada_half16(size_t offset, const __global half *p);
float __ovld vloada_half(size_t offset, const __local half *p);
float2 __ovld vloada_half2(size_t offset, const __local half *p);
float3 __ovld vloada_half3(size_t offset, const __local half *p);
float4 __ovld vloada_half4(size_t offset, const __local half *p);
float8 __ovld vloada_half8(size_t offset, const __local half *p);
float16 __ovld vloada_half16(size_t offset, const __local half *p);
float __ovld vloada_half(size_t offset, const __private half *p);
float2 __ovld vloada_half2(size_t offset, const __private half *p);
float3 __ovld vloada_half3(size_t offset, const __private half *p);
float4 __ovld vloada_half4(size_t offset, const __private half *p);
float8 __ovld vloada_half8(size_t offset, const __private half *p);
float16 __ovld vloada_half16(size_t offset, const __private half *p);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
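/*
 * Illustrative sketch (not part of the header): vloada_halfn differs from
 * vload_halfn in its alignment requirement and, for n = 3, in its addressing
 * (it reads from p + (offset * 4), i.e. float4-style padded slots). Names
 * below are hypothetical.
 *
 *   __kernel void load_padded3(__global const half *in, __global float3 *out) {
 *     size_t i = get_global_id(0);
 *     // Reads a half3 from (in + i * 4); the address must be aligned to
 *     // sizeof(half) * 4 bytes.
 *     out[i] = vloada_half3(i, in);
 *   }
 */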
/**
* The floatn value given by data is converted to
* a halfn value using the appropriate rounding
* mode.
* For n = 1, 2, 4, 8 and 16, the halfn value is
* written to the address computed as (p + (offset
* * n)). The address computed as (p + (offset *
* n)) must be aligned to sizeof (halfn) bytes.
* For n = 3, the half3 value is written to the
* address computed as (p + (offset * 4)). The
* address computed as (p + (offset * 4)) must be
* aligned to sizeof (half) * 4 bytes.
* vstorea_halfn uses the current rounding
* mode. The default current rounding mode is
* round to nearest even.
*/
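/*
 * Illustrative sketch (not part of the header): vstorea_half3 writes into a
 * padded four-half slot at (p + (offset * 4)), matching vloada_half3 above.
 * Names below are hypothetical.
 *
 *   __kernel void store_padded3(__global const float3 *in, __global half *out) {
 *     size_t i = get_global_id(0);
 *     vstorea_half3(in[i], i, out);   // writes a half3 at (out + i * 4)
 *   }
 */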
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld vstorea_half(float data, size_t offset, half *p);
void __ovld vstorea_half2(float2 data, size_t offset, half *p);
void __ovld vstorea_half3(float3 data, size_t offset, half *p);
void __ovld vstorea_half4(float4 data, size_t offset, half *p);
void __ovld vstorea_half8(float8 data, size_t offset, half *p);
void __ovld vstorea_half16(float16 data, size_t offset, half *p);
void __ovld vstorea_half_rte(float data, size_t offset, half *p);
void __ovld vstorea_half2_rte(float2 data, size_t offset, half *p);
void __ovld vstorea_half3_rte(float3 data, size_t offset, half *p);
void __ovld vstorea_half4_rte(float4 data, size_t offset, half *p);
void __ovld vstorea_half8_rte(float8 data, size_t offset, half *p);
void __ovld vstorea_half16_rte(float16 data, size_t offset, half *p);
void __ovld vstorea_half_rtz(float data, size_t offset, half *p);
void __ovld vstorea_half2_rtz(float2 data, size_t offset, half *p);
void __ovld vstorea_half3_rtz(float3 data, size_t offset, half *p);
void __ovld vstorea_half4_rtz(float4 data, size_t offset, half *p);
void __ovld vstorea_half8_rtz(float8 data, size_t offset, half *p);
void __ovld vstorea_half16_rtz(float16 data, size_t offset, half *p);
void __ovld vstorea_half_rtp(float data, size_t offset, half *p);
void __ovld vstorea_half2_rtp(float2 data, size_t offset, half *p);
void __ovld vstorea_half3_rtp(float3 data, size_t offset, half *p);
void __ovld vstorea_half4_rtp(float4 data, size_t offset, half *p);
void __ovld vstorea_half8_rtp(float8 data, size_t offset, half *p);
void __ovld vstorea_half16_rtp(float16 data, size_t offset, half *p);
void __ovld vstorea_half_rtn(float data, size_t offset, half *p);
void __ovld vstorea_half2_rtn(float2 data, size_t offset, half *p);
void __ovld vstorea_half3_rtn(float3 data, size_t offset, half *p);
void __ovld vstorea_half4_rtn(float4 data, size_t offset, half *p);
void __ovld vstorea_half8_rtn(float8 data, size_t offset, half *p);
void __ovld vstorea_half16_rtn(float16 data, size_t offset, half *p);
#ifdef cl_khr_fp64
void __ovld vstorea_half(double data, size_t offset, half *p);
void __ovld vstorea_half2(double2 data, size_t offset, half *p);
void __ovld vstorea_half3(double3 data, size_t offset, half *p);
void __ovld vstorea_half4(double4 data, size_t offset, half *p);
void __ovld vstorea_half8(double8 data, size_t offset, half *p);
void __ovld vstorea_half16(double16 data, size_t offset, half *p);
void __ovld vstorea_half_rte(double data, size_t offset, half *p);
void __ovld vstorea_half2_rte(double2 data, size_t offset, half *p);
void __ovld vstorea_half3_rte(double3 data, size_t offset, half *p);
void __ovld vstorea_half4_rte(double4 data, size_t offset, half *p);
void __ovld vstorea_half8_rte(double8 data, size_t offset, half *p);
void __ovld vstorea_half16_rte(double16 data, size_t offset, half *p);
void __ovld vstorea_half_rtz(double data, size_t offset, half *p);
void __ovld vstorea_half2_rtz(double2 data, size_t offset, half *p);
void __ovld vstorea_half3_rtz(double3 data, size_t offset, half *p);
void __ovld vstorea_half4_rtz(double4 data, size_t offset, half *p);
void __ovld vstorea_half8_rtz(double8 data, size_t offset, half *p);
void __ovld vstorea_half16_rtz(double16 data, size_t offset, half *p);
void __ovld vstorea_half_rtp(double data, size_t offset, half *p);
void __ovld vstorea_half2_rtp(double2 data, size_t offset, half *p);
void __ovld vstorea_half3_rtp(double3 data, size_t offset, half *p);
void __ovld vstorea_half4_rtp(double4 data, size_t offset, half *p);
void __ovld vstorea_half8_rtp(double8 data, size_t offset, half *p);
void __ovld vstorea_half16_rtp(double16 data, size_t offset, half *p);
void __ovld vstorea_half_rtn(double data, size_t offset, half *p);
void __ovld vstorea_half2_rtn(double2 data, size_t offset, half *p);
void __ovld vstorea_half3_rtn(double3 data, size_t offset, half *p);
void __ovld vstorea_half4_rtn(double4 data, size_t offset, half *p);
void __ovld vstorea_half8_rtn(double8 data, size_t offset, half *p);
void __ovld vstorea_half16_rtn(double16 data, size_t offset, half *p);
#endif //cl_khr_fp64
#else
void __ovld vstorea_half(float data, size_t offset, __global half *p);
void __ovld vstorea_half2(float2 data, size_t offset, __global half *p);
void __ovld vstorea_half3(float3 data, size_t offset, __global half *p);
void __ovld vstorea_half4(float4 data, size_t offset, __global half *p);
void __ovld vstorea_half8(float8 data, size_t offset, __global half *p);
void __ovld vstorea_half16(float16 data, size_t offset, __global half *p);
void __ovld vstorea_half_rte(float data, size_t offset, __global half *p);
void __ovld vstorea_half2_rte(float2 data, size_t offset, __global half *p);
void __ovld vstorea_half3_rte(float3 data, size_t offset, __global half *p);
void __ovld vstorea_half4_rte(float4 data, size_t offset, __global half *p);
void __ovld vstorea_half8_rte(float8 data, size_t offset, __global half *p);
void __ovld vstorea_half16_rte(float16 data, size_t offset, __global half *p);
void __ovld vstorea_half_rtz(float data, size_t offset, __global half *p);
void __ovld vstorea_half2_rtz(float2 data, size_t offset, __global half *p);
void __ovld vstorea_half3_rtz(float3 data, size_t offset, __global half *p);
void __ovld vstorea_half4_rtz(float4 data, size_t offset, __global half *p);
void __ovld vstorea_half8_rtz(float8 data, size_t offset, __global half *p);
void __ovld vstorea_half16_rtz(float16 data, size_t offset, __global half *p);
void __ovld vstorea_half_rtp(float data, size_t offset, __global half *p);
void __ovld vstorea_half2_rtp(float2 data, size_t offset, __global half *p);
void __ovld vstorea_half3_rtp(float3 data, size_t offset, __global half *p);
void __ovld vstorea_half4_rtp(float4 data, size_t offset, __global half *p);
void __ovld vstorea_half8_rtp(float8 data, size_t offset, __global half *p);
void __ovld vstorea_half16_rtp(float16 data, size_t offset, __global half *p);
void __ovld vstorea_half_rtn(float data, size_t offset, __global half *p);
void __ovld vstorea_half2_rtn(float2 data, size_t offset, __global half *p);
void __ovld vstorea_half3_rtn(float3 data, size_t offset, __global half *p);
void __ovld vstorea_half4_rtn(float4 data, size_t offset, __global half *p);
void __ovld vstorea_half8_rtn(float8 data, size_t offset, __global half *p);
void __ovld vstorea_half16_rtn(float16 data, size_t offset, __global half *p);
void __ovld vstorea_half(float data, size_t offset, __local half *p);
void __ovld vstorea_half2(float2 data, size_t offset, __local half *p);
void __ovld vstorea_half3(float3 data, size_t offset, __local half *p);
void __ovld vstorea_half4(float4 data, size_t offset, __local half *p);
void __ovld vstorea_half8(float8 data, size_t offset, __local half *p);
void __ovld vstorea_half16(float16 data, size_t offset, __local half *p);
void __ovld vstorea_half_rte(float data, size_t offset, __local half *p);
void __ovld vstorea_half2_rte(float2 data, size_t offset, __local half *p);
void __ovld vstorea_half3_rte(float3 data, size_t offset, __local half *p);
void __ovld vstorea_half4_rte(float4 data, size_t offset, __local half *p);
void __ovld vstorea_half8_rte(float8 data, size_t offset, __local half *p);
void __ovld vstorea_half16_rte(float16 data, size_t offset, __local half *p);
void __ovld vstorea_half_rtz(float data, size_t offset, __local half *p);
void __ovld vstorea_half2_rtz(float2 data, size_t offset, __local half *p);
void __ovld vstorea_half3_rtz(float3 data, size_t offset, __local half *p);
void __ovld vstorea_half4_rtz(float4 data, size_t offset, __local half *p);
void __ovld vstorea_half8_rtz(float8 data, size_t offset, __local half *p);
void __ovld vstorea_half16_rtz(float16 data, size_t offset, __local half *p);
void __ovld vstorea_half_rtp(float data, size_t offset, __local half *p);
void __ovld vstorea_half2_rtp(float2 data, size_t offset, __local half *p);
void __ovld vstorea_half3_rtp(float3 data, size_t offset, __local half *p);
void __ovld vstorea_half4_rtp(float4 data, size_t offset, __local half *p);
void __ovld vstorea_half8_rtp(float8 data, size_t offset, __local half *p);
void __ovld vstorea_half16_rtp(float16 data, size_t offset, __local half *p);
void __ovld vstorea_half_rtn(float data, size_t offset, __local half *p);
void __ovld vstorea_half2_rtn(float2 data, size_t offset, __local half *p);
void __ovld vstorea_half3_rtn(float3 data, size_t offset, __local half *p);
void __ovld vstorea_half4_rtn(float4 data, size_t offset, __local half *p);
void __ovld vstorea_half8_rtn(float8 data, size_t offset, __local half *p);
void __ovld vstorea_half16_rtn(float16 data, size_t offset, __local half *p);
void __ovld vstorea_half(float data, size_t offset, __private half *p);
void __ovld vstorea_half2(float2 data, size_t offset, __private half *p);
void __ovld vstorea_half3(float3 data, size_t offset, __private half *p);
void __ovld vstorea_half4(float4 data, size_t offset, __private half *p);
void __ovld vstorea_half8(float8 data, size_t offset, __private half *p);
void __ovld vstorea_half16(float16 data, size_t offset, __private half *p);
void __ovld vstorea_half_rte(float data, size_t offset, __private half *p);
void __ovld vstorea_half2_rte(float2 data, size_t offset, __private half *p);
void __ovld vstorea_half3_rte(float3 data, size_t offset, __private half *p);
void __ovld vstorea_half4_rte(float4 data, size_t offset, __private half *p);
void __ovld vstorea_half8_rte(float8 data, size_t offset, __private half *p);
void __ovld vstorea_half16_rte(float16 data, size_t offset, __private half *p);
void __ovld vstorea_half_rtz(float data, size_t offset, __private half *p);
void __ovld vstorea_half2_rtz(float2 data, size_t offset, __private half *p);
void __ovld vstorea_half3_rtz(float3 data, size_t offset, __private half *p);
void __ovld vstorea_half4_rtz(float4 data, size_t offset, __private half *p);
void __ovld vstorea_half8_rtz(float8 data, size_t offset, __private half *p);
void __ovld vstorea_half16_rtz(float16 data, size_t offset, __private half *p);
void __ovld vstorea_half_rtp(float data, size_t offset, __private half *p);
void __ovld vstorea_half2_rtp(float2 data, size_t offset, __private half *p);
void __ovld vstorea_half3_rtp(float3 data, size_t offset, __private half *p);
void __ovld vstorea_half4_rtp(float4 data, size_t offset, __private half *p);
void __ovld vstorea_half8_rtp(float8 data, size_t offset, __private half *p);
void __ovld vstorea_half16_rtp(float16 data, size_t offset, __private half *p);
void __ovld vstorea_half_rtn(float data, size_t offset, __private half *p);
void __ovld vstorea_half2_rtn(float2 data, size_t offset, __private half *p);
void __ovld vstorea_half3_rtn(float3 data, size_t offset, __private half *p);
void __ovld vstorea_half4_rtn(float4 data, size_t offset, __private half *p);
void __ovld vstorea_half8_rtn(float8 data, size_t offset, __private half *p);
void __ovld vstorea_half16_rtn(float16 data, size_t offset, __private half *p);
#ifdef cl_khr_fp64
void __ovld vstorea_half(double data, size_t offset, __global half *p);
void __ovld vstorea_half2(double2 data, size_t offset, __global half *p);
void __ovld vstorea_half3(double3 data, size_t offset, __global half *p);
void __ovld vstorea_half4(double4 data, size_t offset, __global half *p);
void __ovld vstorea_half8(double8 data, size_t offset, __global half *p);
void __ovld vstorea_half16(double16 data, size_t offset, __global half *p);
void __ovld vstorea_half_rte(double data, size_t offset, __global half *p);
void __ovld vstorea_half2_rte(double2 data, size_t offset, __global half *p);
void __ovld vstorea_half3_rte(double3 data, size_t offset, __global half *p);
void __ovld vstorea_half4_rte(double4 data, size_t offset, __global half *p);
void __ovld vstorea_half8_rte(double8 data, size_t offset, __global half *p);
void __ovld vstorea_half16_rte(double16 data, size_t offset, __global half *p);
void __ovld vstorea_half_rtz(double data, size_t offset, __global half *p);
void __ovld vstorea_half2_rtz(double2 data, size_t offset, __global half *p);
void __ovld vstorea_half3_rtz(double3 data, size_t offset, __global half *p);
void __ovld vstorea_half4_rtz(double4 data, size_t offset, __global half *p);
void __ovld vstorea_half8_rtz(double8 data, size_t offset, __global half *p);
void __ovld vstorea_half16_rtz(double16 data, size_t offset, __global half *p);
void __ovld vstorea_half_rtp(double data, size_t offset, __global half *p);
void __ovld vstorea_half2_rtp(double2 data, size_t offset, __global half *p);
void __ovld vstorea_half3_rtp(double3 data, size_t offset, __global half *p);
void __ovld vstorea_half4_rtp(double4 data, size_t offset, __global half *p);
void __ovld vstorea_half8_rtp(double8 data, size_t offset, __global half *p);
void __ovld vstorea_half16_rtp(double16 data, size_t offset, __global half *p);
void __ovld vstorea_half_rtn(double data, size_t offset, __global half *p);
void __ovld vstorea_half2_rtn(double2 data, size_t offset, __global half *p);
void __ovld vstorea_half3_rtn(double3 data, size_t offset, __global half *p);
void __ovld vstorea_half4_rtn(double4 data, size_t offset, __global half *p);
void __ovld vstorea_half8_rtn(double8 data, size_t offset, __global half *p);
void __ovld vstorea_half16_rtn(double16 data, size_t offset, __global half *p);
void __ovld vstorea_half(double data, size_t offset, __local half *p);
void __ovld vstorea_half2(double2 data, size_t offset, __local half *p);
void __ovld vstorea_half3(double3 data, size_t offset, __local half *p);
void __ovld vstorea_half4(double4 data, size_t offset, __local half *p);
void __ovld vstorea_half8(double8 data, size_t offset, __local half *p);
void __ovld vstorea_half16(double16 data, size_t offset, __local half *p);
void __ovld vstorea_half_rte(double data, size_t offset, __local half *p);
void __ovld vstorea_half2_rte(double2 data, size_t offset, __local half *p);
void __ovld vstorea_half3_rte(double3 data, size_t offset, __local half *p);
void __ovld vstorea_half4_rte(double4 data, size_t offset, __local half *p);
void __ovld vstorea_half8_rte(double8 data, size_t offset, __local half *p);
void __ovld vstorea_half16_rte(double16 data, size_t offset, __local half *p);
void __ovld vstorea_half_rtz(double data, size_t offset, __local half *p);
void __ovld vstorea_half2_rtz(double2 data, size_t offset, __local half *p);
void __ovld vstorea_half3_rtz(double3 data, size_t offset, __local half *p);
void __ovld vstorea_half4_rtz(double4 data, size_t offset, __local half *p);
void __ovld vstorea_half8_rtz(double8 data, size_t offset, __local half *p);
void __ovld vstorea_half16_rtz(double16 data, size_t offset, __local half *p);
void __ovld vstorea_half_rtp(double data, size_t offset, __local half *p);
void __ovld vstorea_half2_rtp(double2 data, size_t offset, __local half *p);
void __ovld vstorea_half3_rtp(double3 data, size_t offset, __local half *p);
void __ovld vstorea_half4_rtp(double4 data, size_t offset, __local half *p);
void __ovld vstorea_half8_rtp(double8 data, size_t offset, __local half *p);
void __ovld vstorea_half16_rtp(double16 data, size_t offset, __local half *p);
void __ovld vstorea_half_rtn(double data, size_t offset, __local half *p);
void __ovld vstorea_half2_rtn(double2 data, size_t offset, __local half *p);
void __ovld vstorea_half3_rtn(double3 data, size_t offset, __local half *p);
void __ovld vstorea_half4_rtn(double4 data, size_t offset, __local half *p);
void __ovld vstorea_half8_rtn(double8 data, size_t offset, __local half *p);
void __ovld vstorea_half16_rtn(double16 data, size_t offset, __local half *p);
void __ovld vstorea_half(double data, size_t offset, __private half *p);
void __ovld vstorea_half2(double2 data, size_t offset, __private half *p);
void __ovld vstorea_half3(double3 data, size_t offset, __private half *p);
void __ovld vstorea_half4(double4 data, size_t offset, __private half *p);
void __ovld vstorea_half8(double8 data, size_t offset, __private half *p);
void __ovld vstorea_half16(double16 data, size_t offset, __private half *p);
void __ovld vstorea_half_rte(double data, size_t offset, __private half *p);
void __ovld vstorea_half2_rte(double2 data, size_t offset, __private half *p);
void __ovld vstorea_half3_rte(double3 data, size_t offset, __private half *p);
void __ovld vstorea_half4_rte(double4 data, size_t offset, __private half *p);
void __ovld vstorea_half8_rte(double8 data, size_t offset, __private half *p);
void __ovld vstorea_half16_rte(double16 data, size_t offset, __private half *p);
void __ovld vstorea_half_rtz(double data, size_t offset, __private half *p);
void __ovld vstorea_half2_rtz(double2 data, size_t offset, __private half *p);
void __ovld vstorea_half3_rtz(double3 data, size_t offset, __private half *p);
void __ovld vstorea_half4_rtz(double4 data, size_t offset, __private half *p);
void __ovld vstorea_half8_rtz(double8 data, size_t offset, __private half *p);
void __ovld vstorea_half16_rtz(double16 data, size_t offset, __private half *p);
void __ovld vstorea_half_rtp(double data, size_t offset, __private half *p);
void __ovld vstorea_half2_rtp(double2 data, size_t offset, __private half *p);
void __ovld vstorea_half3_rtp(double3 data, size_t offset, __private half *p);
void __ovld vstorea_half4_rtp(double4 data, size_t offset, __private half *p);
void __ovld vstorea_half8_rtp(double8 data, size_t offset, __private half *p);
void __ovld vstorea_half16_rtp(double16 data, size_t offset, __private half *p);
void __ovld vstorea_half_rtn(double data, size_t offset, __private half *p);
void __ovld vstorea_half2_rtn(double2 data, size_t offset, __private half *p);
void __ovld vstorea_half3_rtn(double3 data, size_t offset, __private half *p);
void __ovld vstorea_half4_rtn(double4 data, size_t offset, __private half *p);
void __ovld vstorea_half8_rtn(double8 data, size_t offset, __private half *p);
void __ovld vstorea_half16_rtn(double16 data, size_t offset, __private half *p);
#endif //cl_khr_fp64
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions
/**
* All work-items in a work-group executing the kernel
* on a processor must execute this function before any
* are allowed to continue execution beyond the barrier.
* This function must be encountered by all work-items in
* a work-group executing the kernel.
* If barrier is inside a conditional statement, then all
* work-items must enter the conditional if any work-item
* enters the conditional statement and executes the
* barrier.
* If barrier is inside a loop, all work-items must execute
* the barrier for each iteration of the loop before any are
* allowed to continue execution beyond the barrier.
* The barrier function also queues a memory fence
* (reads and writes) to ensure correct ordering of
* memory operations to local or global memory.
* The flags argument specifies the memory address space
* and can be set to a combination of the following literal
* values.
* CLK_LOCAL_MEM_FENCE - The barrier function
* will either flush any variables stored in local memory
* or queue a memory fence to ensure correct ordering of
* memory operations to local memory.
* CLK_GLOBAL_MEM_FENCE - The barrier function
* will queue a memory fence to ensure correct ordering
* of memory operations to global memory. This can be
* useful when work-items, for example, write to buffer or
* image objects and then want to read the updated data.
*/
void __ovld __conv barrier(cl_mem_fence_flags flags);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
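/*
* Illustrative usage sketch (not part of this header; the kernel and argument
* names below are made up). Each work-item writes its own slot of a local
* buffer, and barrier(CLK_LOCAL_MEM_FENCE) makes every slot visible to the
* whole work-group before any neighbouring slot is read:
*
*   __kernel void shift_left(__global const int *in, __global int *out,
*                            __local int *tmp) {
*       size_t lid = get_local_id(0);
*       size_t lsz = get_local_size(0);
*       tmp[lid] = in[get_global_id(0)];
*       barrier(CLK_LOCAL_MEM_FENCE);      // every slot written before any read
*       out[get_global_id(0)] = tmp[(lid + 1) % lsz];
*   }
*/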
// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
/**
* Orders loads and stores of a work-item
* executing a kernel. This means that loads
* and stores preceding the mem_fence will
* be committed to memory before any loads
* and stores following the mem_fence.
* The flags argument specifies the memory
* address space and can be set to a
* combination of the following literal
* values:
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
void __ovld mem_fence(cl_mem_fence_flags flags);
/**
* Read memory barrier that orders only
* loads.
* The flags argument specifies the memory
* address space and can be set to a
* combination of the following literal
* values:
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
void __ovld read_mem_fence(cl_mem_fence_flags flags);
/**
* Write memory barrier that orders only
* stores.
* The flags argument specifies the memory
* address space and can be set to a
* combination of the following literal
* values:
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
void __ovld write_mem_fence(cl_mem_fence_flags flags);
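/*
* Illustrative usage sketch (not part of this header; names are made up).
* Unlike barrier(), the fences only order the issuing work-item's own memory
* operations and do not synchronize work-items with each other. A typical
* pattern is committing a payload store before a flag store:
*
*   __kernel void publish(__global int *data, volatile __global int *flag) {
*       size_t gid = get_global_id(0);
*       data[gid] = 42;                          // payload
*       write_mem_fence(CLK_GLOBAL_MEM_FENCE);   // payload committed first
*       flag[gid] = 1;                           // flag committed after payload
*   }
*/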
// OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
cl_mem_fence_flags __ovld get_fence(const void *ptr);
cl_mem_fence_flags __ovld get_fence(void *ptr);
/**
* Builtin functions to_global, to_local, and to_private need to be declared as Clang builtin functions
* and checked in Sema since they should be declared as
* addr gentype* to_addr (gentype*);
* where gentype is a builtin type or a user-defined type.
*/
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
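/*
* Illustrative usage sketch (not part of this header; the helper name is made
* up). With the OpenCL 2.0 generic address space, get_fence() picks the fence
* flags that match whatever space a generic pointer refers to, and to_global()
* recovers the named address space when it applies:
*
*   void flush_then_mark(int *p) {        // p is a generic-address-space pointer
*       mem_fence(get_fence(p));          // fence only the space p points into
*       __global int *gp = to_global(p);  // non-NULL only if p is in __global
*       if (gp) *gp = 1;
*   }
*/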
// OpenCL v1.1 s6.11.10, v1.2 s6.12.10, v2.0 s6.13.10 - Async Copies from Global to Local Memory, Local to Global Memory, and Prefetch
/**
* event_t async_work_group_copy (
* __global gentype *dst,
* const __local gentype *src,
* size_t num_elements,
* event_t event)
* Perform an async copy of num_elements
* gentype elements from src to dst. The async
* copy is performed by all work-items in a work-group
* and this built-in function must therefore
* be encountered by all work-items in a work-group
* executing the kernel with the same
* argument values; otherwise the results are
* undefined.
* Returns an event object that can be used by
* wait_group_events to wait for the async copy
* to finish. The event argument can also be used
* to associate the async_work_group_copy with
* a previous async copy allowing an event to be
* shared by multiple async copies; otherwise event
* should be zero.
* If the event argument is non-zero, the event
* object supplied in that argument will be returned.
* This function does not perform any implicit
* synchronization of source data such as using a
* barrier before performing the copy.
*/
event_t __ovld async_work_group_copy(__local char *dst, const __global char *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uchar *dst, const __global uchar *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local short *dst, const __global short *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ushort *dst, const __global ushort *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local int *dst, const __global int *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uint *dst, const __global uint *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local long *dst, const __global long *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ulong *dst, const __global ulong *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local float *dst, const __global float *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local char2 *dst, const __global char2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uchar2 *dst, const __global uchar2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local short2 *dst, const __global short2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ushort2 *dst, const __global ushort2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local int2 *dst, const __global int2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uint2 *dst, const __global uint2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local long2 *dst, const __global long2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ulong2 *dst, const __global ulong2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local float2 *dst, const __global float2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local char3 *dst, const __global char3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uchar3 *dst, const __global uchar3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local short3 *dst, const __global short3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ushort3 *dst, const __global ushort3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local int3 *dst, const __global int3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uint3 *dst, const __global uint3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local long3 *dst, const __global long3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ulong3 *dst, const __global ulong3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local float3 *dst, const __global float3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local char4 *dst, const __global char4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uchar4 *dst, const __global uchar4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local short4 *dst, const __global short4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ushort4 *dst, const __global ushort4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local int4 *dst, const __global int4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uint4 *dst, const __global uint4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local long4 *dst, const __global long4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ulong4 *dst, const __global ulong4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local float4 *dst, const __global float4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local char8 *dst, const __global char8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uchar8 *dst, const __global uchar8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local short8 *dst, const __global short8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ushort8 *dst, const __global ushort8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local int8 *dst, const __global int8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uint8 *dst, const __global uint8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local long8 *dst, const __global long8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ulong8 *dst, const __global ulong8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local float8 *dst, const __global float8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local char16 *dst, const __global char16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uchar16 *dst, const __global uchar16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local short16 *dst, const __global short16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ushort16 *dst, const __global ushort16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local int16 *dst, const __global int16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local uint16 *dst, const __global uint16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local long16 *dst, const __global long16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local ulong16 *dst, const __global ulong16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local float16 *dst, const __global float16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global char *dst, const __local char *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uchar *dst, const __local uchar *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global short *dst, const __local short *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ushort *dst, const __local ushort *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global int *dst, const __local int *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uint *dst, const __local uint *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global long *dst, const __local long *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ulong *dst, const __local ulong *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global float *dst, const __local float *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global char2 *dst, const __local char2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uchar2 *dst, const __local uchar2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global short2 *dst, const __local short2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ushort2 *dst, const __local ushort2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global int2 *dst, const __local int2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uint2 *dst, const __local uint2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global long2 *dst, const __local long2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ulong2 *dst, const __local ulong2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global float2 *dst, const __local float2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global char3 *dst, const __local char3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uchar3 *dst, const __local uchar3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global short3 *dst, const __local short3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ushort3 *dst, const __local ushort3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global int3 *dst, const __local int3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uint3 *dst, const __local uint3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global long3 *dst, const __local long3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ulong3 *dst, const __local ulong3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global float3 *dst, const __local float3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global char4 *dst, const __local char4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uchar4 *dst, const __local uchar4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global short4 *dst, const __local short4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ushort4 *dst, const __local ushort4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global int4 *dst, const __local int4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uint4 *dst, const __local uint4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global long4 *dst, const __local long4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ulong4 *dst, const __local ulong4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global float4 *dst, const __local float4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global char8 *dst, const __local char8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uchar8 *dst, const __local uchar8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global short8 *dst, const __local short8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ushort8 *dst, const __local ushort8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global int8 *dst, const __local int8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uint8 *dst, const __local uint8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global long8 *dst, const __local long8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ulong8 *dst, const __local ulong8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global float8 *dst, const __local float8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global char16 *dst, const __local char16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uchar16 *dst, const __local uchar16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global short16 *dst, const __local short16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ushort16 *dst, const __local ushort16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global int16 *dst, const __local int16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global uint16 *dst, const __local uint16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, event_t event);
#ifdef cl_khr_fp64
event_t __ovld async_work_group_copy(__local double *dst, const __global double *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local double4 *dst, const __global double4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local double8 *dst, const __global double8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local double16 *dst, const __global double16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global double *dst, const __local double *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global double2 *dst, const __local double2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global double3 *dst, const __local double3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, event_t event);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
event_t __ovld async_work_group_copy(__local half *dst, const __global half *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local half3 *dst, const __global half3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local half4 *dst, const __global half4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local half8 *dst, const __global half8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__local half16 *dst, const __global half16 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global half *dst, const __local half *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global half2 *dst, const __local half2 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global half3 *dst, const __local half3 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global half4 *dst, const __local half4 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global half8 *dst, const __local half8 *src, size_t num_elements, event_t event);
event_t __ovld async_work_group_copy(__global half16 *dst, const __local half16 *src, size_t num_elements, event_t event);
#endif //cl_khr_fp16
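/*
* Illustrative usage sketch (not part of this header; the kernel and argument
* names are made up). Every work-item in the group issues the same copy with
* identical arguments, then waits on the returned event before reading the
* local tile:
*
*   __kernel void tile_scale(__global const float *in, __global float *out,
*                            __local float *tile) {
*       size_t lsz = get_local_size(0);
*       event_t ev = async_work_group_copy(tile, in + get_group_id(0) * lsz,
*                                          lsz, 0);
*       wait_group_events(1, &ev);                 // tile is now valid
*       out[get_global_id(0)] = tile[get_local_id(0)] * 2.0f;
*   }
*/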
/**
* Perform an async gather of num_elements
* gentype elements from src to dst. The
* src_stride is the stride in elements for each
* gentype element read from src. The dst_stride
* is the stride in elements for each gentype
* element written to dst. The async gather is
* performed by all work-items in a work-group.
* This built-in function must therefore be
* encountered by all work-items in a work-group
* executing the kernel with the same argument
* values; otherwise the results are undefined.
* Returns an event object that can be used by
* wait_group_events to wait for the async copy
* to finish. The event argument can also be used
* to associate the
* async_work_group_strided_copy with a
* previous async copy allowing an event to be
* shared by multiple async copies; otherwise event
* should be zero.
* If the event argument is non-zero, the event
* object supplied in that argument will be returned.
* This function does not perform any implicit
* synchronization of source data such as using a
* barrier before performing the copy.
*/
event_t __ovld async_work_group_strided_copy(__local char *dst, const __global char *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uchar *dst, const __global uchar *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local short *dst, const __global short *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ushort *dst, const __global ushort *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local int *dst, const __global int *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uint *dst, const __global uint *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local long *dst, const __global long *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ulong *dst, const __global ulong *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local float *dst, const __global float *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local char2 *dst, const __global char2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uchar2 *dst, const __global uchar2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local short2 *dst, const __global short2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ushort2 *dst, const __global ushort2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local int2 *dst, const __global int2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uint2 *dst, const __global uint2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local long2 *dst, const __global long2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ulong2 *dst, const __global ulong2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local float2 *dst, const __global float2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local char3 *dst, const __global char3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uchar3 *dst, const __global uchar3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local short3 *dst, const __global short3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ushort3 *dst, const __global ushort3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local int3 *dst, const __global int3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uint3 *dst, const __global uint3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local long3 *dst, const __global long3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ulong3 *dst, const __global ulong3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local float3 *dst, const __global float3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local char4 *dst, const __global char4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uchar4 *dst, const __global uchar4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local short4 *dst, const __global short4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ushort4 *dst, const __global ushort4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local int4 *dst, const __global int4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uint4 *dst, const __global uint4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local long4 *dst, const __global long4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ulong4 *dst, const __global ulong4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local float4 *dst, const __global float4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local char8 *dst, const __global char8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uchar8 *dst, const __global uchar8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local short8 *dst, const __global short8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ushort8 *dst, const __global ushort8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local int8 *dst, const __global int8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uint8 *dst, const __global uint8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local long8 *dst, const __global long8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ulong8 *dst, const __global ulong8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local float8 *dst, const __global float8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local char16 *dst, const __global char16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uchar16 *dst, const __global uchar16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local short16 *dst, const __global short16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ushort16 *dst, const __global ushort16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local int16 *dst, const __global int16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local uint16 *dst, const __global uint16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local long16 *dst, const __global long16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local ulong16 *dst, const __global ulong16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local float16 *dst, const __global float16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global char *dst, const __local char *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uchar *dst, const __local uchar *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global short *dst, const __local short *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ushort *dst, const __local ushort *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global int *dst, const __local int *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uint *dst, const __local uint *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global long *dst, const __local long *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ulong *dst, const __local ulong *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global float *dst, const __local float *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global char2 *dst, const __local char2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uchar2 *dst, const __local uchar2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global short2 *dst, const __local short2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ushort2 *dst, const __local ushort2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global int2 *dst, const __local int2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uint2 *dst, const __local uint2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global long2 *dst, const __local long2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ulong2 *dst, const __local ulong2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global float2 *dst, const __local float2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global char3 *dst, const __local char3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uchar3 *dst, const __local uchar3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global short3 *dst, const __local short3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ushort3 *dst, const __local ushort3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global int3 *dst, const __local int3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uint3 *dst, const __local uint3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global long3 *dst, const __local long3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ulong3 *dst, const __local ulong3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global float3 *dst, const __local float3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global char4 *dst, const __local char4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uchar4 *dst, const __local uchar4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global short4 *dst, const __local short4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ushort4 *dst, const __local ushort4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global int4 *dst, const __local int4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uint4 *dst, const __local uint4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global long4 *dst, const __local long4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ulong4 *dst, const __local ulong4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global float4 *dst, const __local float4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global char8 *dst, const __local char8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uchar8 *dst, const __local uchar8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global short8 *dst, const __local short8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ushort8 *dst, const __local ushort8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global int8 *dst, const __local int8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uint8 *dst, const __local uint8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global long8 *dst, const __local long8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ulong8 *dst, const __local ulong8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global float8 *dst, const __local float8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global char16 *dst, const __local char16 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uchar16 *dst, const __local uchar16 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global short16 *dst, const __local short16 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ushort16 *dst, const __local ushort16 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global int16 *dst, const __local int16 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global uint16 *dst, const __local uint16 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, size_t dst_stride, event_t event);
#ifdef cl_khr_fp64
event_t __ovld async_work_group_strided_copy(__local double *dst, const __global double *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local double4 *dst, const __global double4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local double8 *dst, const __global double8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local double16 *dst, const __global double16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global double *dst, const __local double *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global double2 *dst, const __local double2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global double3 *dst, const __local double3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, size_t dst_stride, event_t event);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
event_t __ovld async_work_group_strided_copy(__local half *dst, const __global half *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local half3 *dst, const __global half3 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local half4 *dst, const __global half4 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local half8 *dst, const __global half8 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__local half16 *dst, const __global half16 *src, size_t num_elements, size_t src_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global half *dst, const __local half *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global half2 *dst, const __local half2 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global half3 *dst, const __local half3 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global half4 *dst, const __local half4 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global half8 *dst, const __local half8 *src, size_t num_elements, size_t dst_stride, event_t event);
event_t __ovld async_work_group_strided_copy(__global half16 *dst, const __local half16 *src, size_t num_elements, size_t dst_stride, event_t event);
#endif //cl_khr_fp16
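/*
* Illustrative usage sketch (not part of this header; names are made up).
* Gather every stride-th element of a row-major matrix (i.e. one column) into
* a densely packed local buffer:
*
*   __kernel void read_column(__global const float *matrix, uint stride,
*                             __global float *out, __local float *col) {
*       size_t lsz = get_local_size(0);
*       event_t ev = async_work_group_strided_copy(col, matrix, lsz, stride, 0);
*       wait_group_events(1, &ev);           // col[i] == matrix[i * stride]
*       out[get_global_id(0)] = col[get_local_id(0)];
*   }
*/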
/**
* Wait for events that identify the
* async_work_group_copy operations to
* complete. The event objects specified in
* event_list will be released after the wait is
* performed.
* This function must be encountered by all work-items
* in a work-group executing the kernel with
* the same num_events and event objects specified
* in event_list; otherwise the results are undefined.
*/
void __ovld wait_group_events(int num_events, event_t *event_list);
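/*
* Illustrative sketch (not part of this header; a_src and b_src are assumed to
* be __global const float * arguments and the code lives inside a kernel body).
* Independent async copies can be waited on together by collecting their
* events into one list:
*
*   __local float a_tile[64], b_tile[64];
*   event_t ev[2];
*   ev[0] = async_work_group_copy(a_tile, a_src, 64, 0);
*   ev[1] = async_work_group_copy(b_tile, b_src, 64, 0);
*   wait_group_events(2, ev);     // both tiles are valid past this point
*/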
/**
* Prefetch num_elements * sizeof(gentype)
* bytes into the global cache. The prefetch
* instruction is applied to a work-item in a work-group
* and does not affect the functional
* behavior of the kernel.
*/
void __ovld prefetch(const __global char *p, size_t num_elements);
void __ovld prefetch(const __global uchar *p, size_t num_elements);
void __ovld prefetch(const __global short *p, size_t num_elements);
void __ovld prefetch(const __global ushort *p, size_t num_elements);
void __ovld prefetch(const __global int *p, size_t num_elements);
void __ovld prefetch(const __global uint *p, size_t num_elements);
void __ovld prefetch(const __global long *p, size_t num_elements);
void __ovld prefetch(const __global ulong *p, size_t num_elements);
void __ovld prefetch(const __global float *p, size_t num_elements);
void __ovld prefetch(const __global char2 *p, size_t num_elements);
void __ovld prefetch(const __global uchar2 *p, size_t num_elements);
void __ovld prefetch(const __global short2 *p, size_t num_elements);
void __ovld prefetch(const __global ushort2 *p, size_t num_elements);
void __ovld prefetch(const __global int2 *p, size_t num_elements);
void __ovld prefetch(const __global uint2 *p, size_t num_elements);
void __ovld prefetch(const __global long2 *p, size_t num_elements);
void __ovld prefetch(const __global ulong2 *p, size_t num_elements);
void __ovld prefetch(const __global float2 *p, size_t num_elements);
void __ovld prefetch(const __global char3 *p, size_t num_elements);
void __ovld prefetch(const __global uchar3 *p, size_t num_elements);
void __ovld prefetch(const __global short3 *p, size_t num_elements);
void __ovld prefetch(const __global ushort3 *p, size_t num_elements);
void __ovld prefetch(const __global int3 *p, size_t num_elements);
void __ovld prefetch(const __global uint3 *p, size_t num_elements);
void __ovld prefetch(const __global long3 *p, size_t num_elements);
void __ovld prefetch(const __global ulong3 *p, size_t num_elements);
void __ovld prefetch(const __global float3 *p, size_t num_elements);
void __ovld prefetch(const __global char4 *p, size_t num_elements);
void __ovld prefetch(const __global uchar4 *p, size_t num_elements);
void __ovld prefetch(const __global short4 *p, size_t num_elements);
void __ovld prefetch(const __global ushort4 *p, size_t num_elements);
void __ovld prefetch(const __global int4 *p, size_t num_elements);
void __ovld prefetch(const __global uint4 *p, size_t num_elements);
void __ovld prefetch(const __global long4 *p, size_t num_elements);
void __ovld prefetch(const __global ulong4 *p, size_t num_elements);
void __ovld prefetch(const __global float4 *p, size_t num_elements);
void __ovld prefetch(const __global char8 *p, size_t num_elements);
void __ovld prefetch(const __global uchar8 *p, size_t num_elements);
void __ovld prefetch(const __global short8 *p, size_t num_elements);
void __ovld prefetch(const __global ushort8 *p, size_t num_elements);
void __ovld prefetch(const __global int8 *p, size_t num_elements);
void __ovld prefetch(const __global uint8 *p, size_t num_elements);
void __ovld prefetch(const __global long8 *p, size_t num_elements);
void __ovld prefetch(const __global ulong8 *p, size_t num_elements);
void __ovld prefetch(const __global float8 *p, size_t num_elements);
void __ovld prefetch(const __global char16 *p, size_t num_elements);
void __ovld prefetch(const __global uchar16 *p, size_t num_elements);
void __ovld prefetch(const __global short16 *p, size_t num_elements);
void __ovld prefetch(const __global ushort16 *p, size_t num_elements);
void __ovld prefetch(const __global int16 *p, size_t num_elements);
void __ovld prefetch(const __global uint16 *p, size_t num_elements);
void __ovld prefetch(const __global long16 *p, size_t num_elements);
void __ovld prefetch(const __global ulong16 *p, size_t num_elements);
void __ovld prefetch(const __global float16 *p, size_t num_elements);
#ifdef cl_khr_fp64
void __ovld prefetch(const __global double *p, size_t num_elements);
void __ovld prefetch(const __global double2 *p, size_t num_elements);
void __ovld prefetch(const __global double3 *p, size_t num_elements);
void __ovld prefetch(const __global double4 *p, size_t num_elements);
void __ovld prefetch(const __global double8 *p, size_t num_elements);
void __ovld prefetch(const __global double16 *p, size_t num_elements);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
void __ovld prefetch(const __global half *p, size_t num_elements);
void __ovld prefetch(const __global half2 *p, size_t num_elements);
void __ovld prefetch(const __global half3 *p, size_t num_elements);
void __ovld prefetch(const __global half4 *p, size_t num_elements);
void __ovld prefetch(const __global half8 *p, size_t num_elements);
void __ovld prefetch(const __global half16 *p, size_t num_elements);
#endif // cl_khr_fp16
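/*
* Illustrative usage sketch (not part of this header; in, tile and tile_size
* are made-up names inside a kernel, with in a __global const float *).
* prefetch() is only a hint and never changes results; here one work-item
* requests the next tile while the group processes the current one:
*
*   if (get_local_id(0) == 0)
*       prefetch(in + (tile + 1) * tile_size, tile_size);
*/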
// OpenCL v1.1 s6.11.11, v1.2 s6.12.11 - Atomic Functions
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
#endif
/**
* Read the 32-bit value (referred to as old)
* stored at the location pointed to by p. Compute
* (old + val) and store the result at the location
* pointed to by p. The function returns old.
*/
int __ovld atomic_add(volatile __global int *p, int val);
unsigned int __ovld atomic_add(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_add(volatile __local int *p, int val);
unsigned int __ovld atomic_add(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_add(volatile int *p, int val);
unsigned int __ovld atomic_add(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_add(volatile __global int *p, int val);
unsigned int __ovld atom_add(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_add(volatile __local int *p, int val);
unsigned int __ovld atom_add(volatile __local unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_add(volatile __global long *p, long val);
unsigned long __ovld atom_add(volatile __global unsigned long *p, unsigned long val);
long __ovld atom_add(volatile __local long *p, long val);
unsigned long __ovld atom_add(volatile __local unsigned long *p, unsigned long val);
#endif
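/*
* Illustrative usage sketch (not part of this header; names are made up).
* Stream compaction with a global counter: the old value returned by
* atomic_add is a unique output slot for each contributing work-item:
*
*   __kernel void compact(__global const int *in, __global int *out,
*                         volatile __global int *count) {
*       int v = in[get_global_id(0)];
*       if (v != 0) {
*           int slot = atomic_add(count, 1);   // returns the value before the add
*           out[slot] = v;
*       }
*   }
*/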
/**
* Read the 32-bit value (referred to as old) stored at the location pointed to
* by p. Compute (old - val) and store the result at the location pointed to by
* p. The function returns old.
*/
int __ovld atomic_sub(volatile __global int *p, int val);
unsigned int __ovld atomic_sub(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_sub(volatile __local int *p, int val);
unsigned int __ovld atomic_sub(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_sub(volatile int *p, int val);
unsigned int __ovld atomic_sub(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_sub(volatile __global int *p, int val);
unsigned int __ovld atom_sub(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_sub(volatile __local int *p, int val);
unsigned int __ovld atom_sub(volatile __local unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_sub(volatile __global long *p, long val);
unsigned long __ovld atom_sub(volatile __global unsigned long *p, unsigned long val);
long __ovld atom_sub(volatile __local long *p, long val);
unsigned long __ovld atom_sub(volatile __local unsigned long *p, unsigned long val);
#endif
/**
* Swaps the old value stored at the location
* pointed to by p with the new value given by
* val. Returns the old value.
*/
int __ovld atomic_xchg(volatile __global int *p, int val);
unsigned int __ovld atomic_xchg(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_xchg(volatile __local int *p, int val);
unsigned int __ovld atomic_xchg(volatile __local unsigned int *p, unsigned int val);
float __ovld atomic_xchg(volatile __global float *p, float val);
float __ovld atomic_xchg(volatile __local float *p, float val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_xchg(volatile int *p, int val);
unsigned int __ovld atomic_xchg(volatile unsigned int *p, unsigned int val);
float __ovld atomic_xchg(volatile float *p, float val);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_xchg(volatile __global int *p, int val);
unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_xchg(volatile __local int *p, int val);
unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_xchg(volatile __global long *p, long val);
long __ovld atom_xchg(volatile __local long *p, long val);
unsigned long __ovld atom_xchg(volatile __global unsigned long *p, unsigned long val);
unsigned long __ovld atom_xchg(volatile __local unsigned long *p, unsigned long val);
#endif
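/*
 * Illustrative sketch (not part of the upstream header): atomic_xchg stores a
 * new value and hands back the previous one, which can be used to claim a
 * shared slot. The names "claim_slot", "slot" and "previous" are hypothetical.
 */
__kernel void claim_slot(volatile __global int *slot, __global int *previous) {
    // Record, per work-item, the value that was displaced by the swap.
    previous[get_global_id(0)] = atomic_xchg(slot, (int)get_global_id(0));
}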
/**
 * Read the 32-bit value (referred to as old)
 * stored at the location pointed to by p.
 * Compute (old + 1) and store the result at
 * the location pointed to by p. The function
 * returns old.
 */
int __ovld atomic_inc(volatile __global int *p);
unsigned int __ovld atomic_inc(volatile __global unsigned int *p);
int __ovld atomic_inc(volatile __local int *p);
unsigned int __ovld atomic_inc(volatile __local unsigned int *p);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_inc(volatile int *p);
unsigned int __ovld atomic_inc(volatile unsigned int *p);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_inc(volatile __global int *p);
unsigned int __ovld atom_inc(volatile __global unsigned int *p);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_inc(volatile __local int *p);
unsigned int __ovld atom_inc(volatile __local unsigned int *p);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_inc(volatile __global long *p);
unsigned long __ovld atom_inc(volatile __global unsigned long *p);
long __ovld atom_inc(volatile __local long *p);
unsigned long __ovld atom_inc(volatile __local unsigned long *p);
#endif
/**
 * Read the 32-bit value (referred to as old)
 * stored at the location pointed to by p.
 * Compute (old - 1) and store the result at
 * the location pointed to by p. The function
 * returns old.
 */
int __ovld atomic_dec(volatile __global int *p);
unsigned int __ovld atomic_dec(volatile __global unsigned int *p);
int __ovld atomic_dec(volatile __local int *p);
unsigned int __ovld atomic_dec(volatile __local unsigned int *p);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_dec(volatile int *p);
unsigned int __ovld atomic_dec(volatile unsigned int *p);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_dec(volatile __global int *p);
unsigned int __ovld atom_dec(volatile __global unsigned int *p);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_dec(volatile __local int *p);
unsigned int __ovld atom_dec(volatile __local unsigned int *p);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_dec(volatile __global long *p);
unsigned long __ovld atom_dec(volatile __global unsigned long *p);
long __ovld atom_dec(volatile __local long *p);
unsigned long __ovld atom_dec(volatile __local unsigned long *p);
#endif
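/*
 * Illustrative sketch (not part of the upstream header), covering both
 * atomic_inc and atomic_dec above: they take no operand, add or subtract 1,
 * and return the prior value. The kernel name and the "refcount" buffer are
 * hypothetical.
 */
__kernel void touch_refcount(volatile __global uint *refcount, int acquire) {
    if (acquire)
        atomic_inc(refcount);   // returns the value before the increment (unused here)
    else
        atomic_dec(refcount);   // returns the value before the decrement (unused here)
}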
/**
 * Read the 32-bit value (referred to as old)
 * stored at the location pointed to by p.
 * Compute (old == cmp) ? val : old and store
 * the result at the location pointed to by p.
 * The function returns old.
 */
int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
unsigned int __ovld atomic_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
unsigned int __ovld atomic_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val);
unsigned int __ovld atomic_cmpxchg(volatile unsigned int *p, unsigned int cmp, unsigned int val);
#endif
#if defined(cl_khr_global_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
unsigned int __ovld atom_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
#endif
#if defined(cl_khr_local_int32_base_atomics)
int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val);
unsigned int __ovld atom_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
#endif
#if defined(cl_khr_int64_base_atomics)
long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val);
unsigned long __ovld atom_cmpxchg(volatile __global unsigned long *p, unsigned long cmp, unsigned long val);
long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val);
unsigned long __ovld atom_cmpxchg(volatile __local unsigned long *p, unsigned long cmp, unsigned long val);
#endif
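/*
 * Illustrative sketch (not part of the upstream header): because atomic_cmpxchg
 * only stores val when the current value equals cmp, and always returns the
 * old value, a retry loop can build read-modify-write operations that have no
 * dedicated builtin. The saturating add below and all names are hypothetical.
 */
__kernel void saturating_add(volatile __global int *counter, int val, int limit) {
    int old, desired;
    do {
        old = *counter;                                     // snapshot the current value
        desired = (old + val > limit) ? limit : old + val;  // clamp to the limit
    } while (atomic_cmpxchg(counter, old, desired) != old); // retry if another work-item won
}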
/**
 * Read the 32-bit value (referred to as old)
 * stored at the location pointed to by p.
 * Compute min(old, val) and store the
 * minimum value at the location pointed to
 * by p. The function returns old.
 */
int __ovld atomic_min(volatile __global int *p, int val);
unsigned int __ovld atomic_min(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_min(volatile __local int *p, int val);
unsigned int __ovld atomic_min(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_min(volatile int *p, int val);
unsigned int __ovld atomic_min(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_min(volatile __global int *p, int val);
unsigned int __ovld atom_min(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_min(volatile __local int *p, int val);
unsigned int __ovld atom_min(volatile __local unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_min(volatile __global long *p, long val);
unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val);
long __ovld atom_min(volatile __local long *p, long val);
unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val);
#endif
/**
 * Read the 32-bit value (referred to as old)
 * stored at the location pointed to by p.
 * Compute max(old, val) and store the
 * maximum value at the location pointed to
 * by p. The function returns old.
 */
int __ovld atomic_max(volatile __global int *p, int val);
unsigned int __ovld atomic_max(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_max(volatile __local int *p, int val);
unsigned int __ovld atomic_max(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_max(volatile int *p, int val);
unsigned int __ovld atomic_max(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_max(volatile __global int *p, int val);
unsigned int __ovld atom_max(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_max(volatile __local int *p, int val);
unsigned int __ovld atom_max(volatile __local unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_max(volatile __global long *p, long val);
unsigned long __ovld atom_max(volatile __global unsigned long *p, unsigned long val);
long __ovld atom_max(volatile __local long *p, long val);
unsigned long __ovld atom_max(volatile __local unsigned long *p, unsigned long val);
#endif
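/*
 * Illustrative sketch (not part of the upstream header), covering atomic_min
 * and atomic_max above: each work-item folds its value into a global running
 * minimum and maximum. The kernel and buffer names are hypothetical.
 */
__kernel void track_range(__global const int *data,
                          volatile __global int *lo,
                          volatile __global int *hi) {
    int v = data[get_global_id(0)];
    atomic_min(lo, v);   // the old minimum is returned but not needed here
    atomic_max(hi, v);
}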
/**
 * Read the 32-bit value (referred to as old)
 * stored at the location pointed to by p.
 * Compute (old & val) and store the result at
 * the location pointed to by p. The function
 * returns old.
 */
int __ovld atomic_and(volatile __global int *p, int val);
unsigned int __ovld atomic_and(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_and(volatile __local int *p, int val);
unsigned int __ovld atomic_and(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_and(volatile int *p, int val);
unsigned int __ovld atomic_and(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_and(volatile __global int *p, int val);
unsigned int __ovld atom_and(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_and(volatile __local int *p, int val);
unsigned int __ovld atom_and(volatile __local unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_and(volatile __global long *p, long val);
unsigned long __ovld atom_and(volatile __global unsigned long *p, unsigned long val);
long __ovld atom_and(volatile __local long *p, long val);
unsigned long __ovld atom_and(volatile __local unsigned long *p, unsigned long val);
#endif
/**
 * Read the 32-bit value (referred to as old)
 * stored at the location pointed to by p.
 * Compute (old | val) and store the result at
 * the location pointed to by p. The function
 * returns old.
 */
int __ovld atomic_or(volatile __global int *p, int val);
unsigned int __ovld atomic_or(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_or(volatile __local int *p, int val);
unsigned int __ovld atomic_or(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_or(volatile int *p, int val);
unsigned int __ovld atomic_or(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_or(volatile __global int *p, int val);
unsigned int __ovld atom_or(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_or(volatile __local int *p, int val);
unsigned int __ovld atom_or(volatile __local unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_or(volatile __global long *p, long val);
unsigned long __ovld atom_or(volatile __global unsigned long *p, unsigned long val);
long __ovld atom_or(volatile __local long *p, long val);
unsigned long __ovld atom_or(volatile __local unsigned long *p, unsigned long val);
#endif
/**
 * Read the 32-bit value (referred to as old)
 * stored at the location pointed to by p.
 * Compute (old ^ val) and store the result at
 * the location pointed to by p. The function
 * returns old.
 */
int __ovld atomic_xor(volatile __global int *p, int val);
unsigned int __ovld atomic_xor(volatile __global unsigned int *p, unsigned int val);
int __ovld atomic_xor(volatile __local int *p, int val);
unsigned int __ovld atomic_xor(volatile __local unsigned int *p, unsigned int val);
#ifdef __OPENCL_CPP_VERSION__
int __ovld atomic_xor(volatile int *p, int val);
unsigned int __ovld atomic_xor(volatile unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
int __ovld atom_xor(volatile __global int *p, int val);
unsigned int __ovld atom_xor(volatile __global unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
int __ovld atom_xor(volatile __local int *p, int val);
unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val);
#endif
#if defined(cl_khr_int64_extended_atomics)
long __ovld atom_xor(volatile __global long *p, long val);
unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val);
long __ovld atom_xor(volatile __local long *p, long val);
unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val);
#endif
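/*
 * Illustrative sketch (not part of the upstream header), covering the bitwise
 * atomics above: atomic_or sets bits in a shared flag word, atomic_and clears
 * them and atomic_xor toggles them. All names are hypothetical.
 */
__kernel void update_flags(volatile __global uint *flags, uint set_mask,
                           uint clear_mask, uint toggle_mask) {
    atomic_or(flags, set_mask);
    atomic_and(flags, ~clear_mask);
    atomic_xor(flags, toggle_mask);
}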
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
#endif
// OpenCL v2.0 s6.13.11 - Atomics Functions
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
#endif
// atomic_init()
void __ovld atomic_init(volatile atomic_int *object, int value);
void __ovld atomic_init(volatile atomic_uint *object, uint value);
void __ovld atomic_init(volatile atomic_float *object, float value);
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
void __ovld atomic_init(volatile atomic_long *object, long value);
void __ovld atomic_init(volatile atomic_ulong *object, ulong value);
#ifdef cl_khr_fp64
void __ovld atomic_init(volatile atomic_double *object, double value);
#endif //cl_khr_fp64
#endif
// atomic_work_item_fence()
void __ovld atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope);
// atomic_fetch()
int __ovld atomic_fetch_add(volatile atomic_int *object, int operand);
int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order);
int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_add(volatile atomic_uint *object, uint operand);
uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order);
uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
int __ovld atomic_fetch_sub(volatile atomic_int *object, int operand);
int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order);
int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_sub(volatile atomic_uint *object, uint operand);
uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order);
uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
int __ovld atomic_fetch_or(volatile atomic_int *object, int operand);
int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order);
int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_or(volatile atomic_uint *object, uint operand);
uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order);
uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
int __ovld atomic_fetch_xor(volatile atomic_int *object, int operand);
int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order);
int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_xor(volatile atomic_uint *object, uint operand);
uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order);
uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
int __ovld atomic_fetch_and(volatile atomic_int *object, int operand);
int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order);
int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_and(volatile atomic_uint *object, uint operand);
uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order);
uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
int __ovld atomic_fetch_min(volatile atomic_int *object, int operand);
int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order);
int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_min(volatile atomic_uint *object, uint operand);
uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order);
uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_min(volatile atomic_uint *object, int operand);
uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order);
uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
int __ovld atomic_fetch_max(volatile atomic_int *object, int operand);
int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order);
int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_max(volatile atomic_uint *object, uint operand);
uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order);
uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
uint __ovld atomic_fetch_max(volatile atomic_uint *object, int operand);
uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order);
uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
long __ovld atomic_fetch_add(volatile atomic_long *object, long operand);
long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order);
long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_add(volatile atomic_ulong *object, ulong operand);
ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
long __ovld atomic_fetch_sub(volatile atomic_long *object, long operand);
long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order);
long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_sub(volatile atomic_ulong *object, ulong operand);
ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
long __ovld atomic_fetch_or(volatile atomic_long *object, long operand);
long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order);
long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_or(volatile atomic_ulong *object, ulong operand);
ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
long __ovld atomic_fetch_xor(volatile atomic_long *object, long operand);
long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order);
long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_xor(volatile atomic_ulong *object, ulong operand);
ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
long __ovld atomic_fetch_and(volatile atomic_long *object, long operand);
long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order);
long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_and(volatile atomic_ulong *object, ulong operand);
ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
long __ovld atomic_fetch_min(volatile atomic_long *object, long operand);
long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order);
long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, ulong operand);
ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, long operand);
ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order);
ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
long __ovld atomic_fetch_max(volatile atomic_long *object, long operand);
long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order);
long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, ulong operand);
ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, long operand);
ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order);
ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
#endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
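/*
 * Illustrative sketch (not part of the upstream header): the _explicit forms
 * above take a memory_order and optionally a memory_scope. A relaxed,
 * device-scope fetch-add is a common way to hand each work-item a unique
 * index; the kernel and buffer names are hypothetical.
 */
__kernel void count_items(volatile __global atomic_uint *counter,
                          __global uint *slots) {
    slots[get_global_id(0)] = atomic_fetch_add_explicit(counter, 1u,
                                                        memory_order_relaxed,
                                                        memory_scope_device);
}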
// OpenCL v2.0 s6.13.11.7.5:
// add/sub: atomic type argument can be uintptr_t/intptr_t, value type argument can be ptrdiff_t.
// or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t.
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand);
uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
uintptr_t __ovld atomic_fetch_sub(volatile atomic_uintptr_t *object, ptrdiff_t operand);
uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
uintptr_t __ovld atomic_fetch_or(volatile atomic_uintptr_t *object, intptr_t operand);
uintptr_t __ovld atomic_fetch_or_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
uintptr_t __ovld atomic_fetch_or_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
uintptr_t __ovld atomic_fetch_xor(volatile atomic_uintptr_t *object, intptr_t operand);
uintptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
uintptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
uintptr_t __ovld atomic_fetch_and(volatile atomic_uintptr_t *object, intptr_t operand);
uintptr_t __ovld atomic_fetch_and_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
uintptr_t __ovld atomic_fetch_and_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
uintptr_t __ovld atomic_fetch_min(volatile atomic_uintptr_t *object, intptr_t operand);
uintptr_t __ovld atomic_fetch_min_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
uintptr_t __ovld atomic_fetch_min_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
uintptr_t __ovld atomic_fetch_max(volatile atomic_uintptr_t *object, intptr_t operand);
uintptr_t __ovld atomic_fetch_max_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
uintptr_t __ovld atomic_fetch_max_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
intptr_t __ovld atomic_fetch_or(volatile atomic_intptr_t *object, uintptr_t operand);
intptr_t __ovld atomic_fetch_or_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
intptr_t __ovld atomic_fetch_or_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
intptr_t __ovld atomic_fetch_xor(volatile atomic_intptr_t *object, uintptr_t operand);
intptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
intptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
intptr_t __ovld atomic_fetch_and(volatile atomic_intptr_t *object, uintptr_t operand);
intptr_t __ovld atomic_fetch_and_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
intptr_t __ovld atomic_fetch_and_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
intptr_t __ovld atomic_fetch_min(volatile atomic_intptr_t *object, uintptr_t operand);
intptr_t __ovld atomic_fetch_min_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
intptr_t __ovld atomic_fetch_min_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
intptr_t __ovld atomic_fetch_max(volatile atomic_intptr_t *object, uintptr_t operand);
intptr_t __ovld atomic_fetch_max_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
intptr_t __ovld atomic_fetch_max_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
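/*
 * Illustrative sketch (not part of the upstream header): per the note above,
 * add/sub on atomic_uintptr_t take a ptrdiff_t operand, which suits a simple
 * bump-style offset reservation. The helper name "reserve_bytes" is
 * hypothetical, and on 64-bit devices these overloads require the 64-bit
 * atomic extensions guarding this block.
 */
uintptr_t reserve_bytes(volatile __global atomic_uintptr_t *next_free,
                        ptrdiff_t nbytes) {
    // Returns the old cursor value, i.e. the start of the reserved region.
    return atomic_fetch_add(next_free, nbytes);
}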
#endif
// atomic_store()
void __ovld atomic_store(volatile atomic_int *object, int desired);
void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order);
void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope);
void __ovld atomic_store(volatile atomic_uint *object, uint desired);
void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order);
void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope);
void __ovld atomic_store(volatile atomic_float *object, float desired);
void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order);
void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope);
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#ifdef cl_khr_fp64
void __ovld atomic_store(volatile atomic_double *object, double desired);
void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order);
void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope);
#endif //cl_khr_fp64
void __ovld atomic_store(volatile atomic_long *object, long desired);
void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order);
void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope);
void __ovld atomic_store(volatile atomic_ulong *object, ulong desired);
void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order);
void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope);
#endif
// atomic_load()
int __ovld atomic_load(volatile atomic_int *object);
int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order);
int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order, memory_scope scope);
uint __ovld atomic_load(volatile atomic_uint *object);
uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order);
uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order, memory_scope scope);
float __ovld atomic_load(volatile atomic_float *object);
float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order);
float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order, memory_scope scope);
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#ifdef cl_khr_fp64
double __ovld atomic_load(volatile atomic_double *object);
double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order);
double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order, memory_scope scope);
#endif //cl_khr_fp64
long __ovld atomic_load(volatile atomic_long *object);
long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order);
long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order, memory_scope scope);
ulong __ovld atomic_load(volatile atomic_ulong *object);
ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order);
ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order, memory_scope scope);
#endif
// atomic_exchange()
int __ovld atomic_exchange(volatile atomic_int *object, int desired);
int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order);
int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope);
uint __ovld atomic_exchange(volatile atomic_uint *object, uint desired);
uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order);
uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope);
float __ovld atomic_exchange(volatile atomic_float *object, float desired);
float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order);
float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope);
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#ifdef cl_khr_fp64
double __ovld atomic_exchange(volatile atomic_double *object, double desired);
double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order);
double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope);
#endif //cl_khr_fp64
long __ovld atomic_exchange(volatile atomic_long *object, long desired);
long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order);
long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope);
ulong __ovld atomic_exchange(volatile atomic_ulong *object, ulong desired);
ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order);
ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope);
#endif
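/*
 * Illustrative sketch (not part of the upstream header): atomic_load,
 * atomic_store and atomic_exchange default to sequentially consistent,
 * all-svm-devices semantics. Here atomic_exchange both publishes a new value
 * and records the one it replaced; all names are hypothetical.
 */
__kernel void replace_value(volatile __global atomic_int *slot,
                            __global int *old_values, int value) {
    old_values[get_global_id(0)] = atomic_exchange(slot, value);
}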
// atomic_compare_exchange_strong() and atomic_compare_exchange_weak()
bool __ovld atomic_compare_exchange_strong(volatile atomic_int *object, int *expected, int desired);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *object, int *expected,
int desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *object, int *expected,
int desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_strong(volatile atomic_uint *object, uint *expected, uint desired);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *object, uint *expected,
uint desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *object, uint *expected,
uint desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_weak(volatile atomic_int *object, int *expected, int desired);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *object, int *expected,
int desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *object, int *expected,
int desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_weak(volatile atomic_uint *object, uint *expected, uint desired);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *object, uint *expected,
uint desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *object, uint *expected,
uint desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_strong(volatile atomic_float *object, float *expected, float desired);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *object, float *expected,
float desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *object, float *expected,
float desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_weak(volatile atomic_float *object, float *expected, float desired);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *object, float *expected,
float desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *object, float *expected,
float desired, memory_order success, memory_order failure, memory_scope scope);
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#ifdef cl_khr_fp64
bool __ovld atomic_compare_exchange_strong(volatile atomic_double *object, double *expected, double desired);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected,
double desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected,
double desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_weak(volatile atomic_double *object, double *expected, double desired);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected,
double desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected,
double desired, memory_order success, memory_order failure, memory_scope scope);
#endif //cl_khr_fp64
bool __ovld atomic_compare_exchange_strong(volatile atomic_long *object, long *expected, long desired);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected,
long desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected,
long desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_weak(volatile atomic_long *object, long *expected, long desired);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected,
long desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected,
long desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_strong(volatile atomic_ulong *object, ulong *expected, ulong desired);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected,
ulong desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected,
ulong desired, memory_order success, memory_order failure, memory_scope scope);
bool __ovld atomic_compare_exchange_weak(volatile atomic_ulong *object, ulong *expected, ulong desired);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected,
ulong desired, memory_order success, memory_order failure);
bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected,
ulong desired, memory_order success, memory_order failure, memory_scope scope);
#endif
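/*
 * Illustrative sketch (not part of the upstream header): the weak form may
 * fail spuriously, so it is normally used in a loop; on failure the builtin
 * writes the current value back into *expected. The helper below emulates an
 * atomic multiply, which has no dedicated builtin; its name is hypothetical.
 */
void atomic_mul_int(volatile __global atomic_int *object, int factor) {
    int expected = atomic_load_explicit(object, memory_order_relaxed);
    // On failure 'expected' is refreshed with the current value, so the loop
    // simply recomputes the product and retries.
    while (!atomic_compare_exchange_weak_explicit(object, &expected,
                                                  expected * factor,
                                                  memory_order_relaxed,
                                                  memory_order_relaxed)) {
    }
}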
// atomic_flag_test_and_set() and atomic_flag_clear()
bool __ovld atomic_flag_test_and_set(volatile atomic_flag *object);
bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order);
bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
void __ovld atomic_flag_clear(volatile atomic_flag *object);
void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order);
void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
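/*
 * Illustrative sketch (not part of the upstream header): atomic_flag is the
 * only type guaranteed to be lock-free, so it is the usual building block for
 * a simple spin lock. The helper names are hypothetical, and spinning between
 * work-items of the same sub-group can deadlock on some hardware, so treat
 * this as an API sketch rather than a recommended pattern.
 */
void spin_lock(volatile __global atomic_flag *lock) {
    while (atomic_flag_test_and_set_explicit(lock, memory_order_acquire,
                                             memory_scope_device)) {
        // busy-wait until the previous holder clears the flag
    }
}
void spin_unlock(volatile __global atomic_flag *lock) {
    atomic_flag_clear_explicit(lock, memory_order_release, memory_scope_device);
}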
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// OpenCL v1.1 s6.11.12, v1.2 s6.12.12, v2.0 s6.13.12 - Miscellaneous Vector Functions
/**
 * The shuffle and shuffle2 built-in functions construct
 * a permutation of elements from one or two input
 * vectors, respectively, that are of the same type,
 * returning a vector with the same element type as the
 * inputs and the same length as the shuffle mask.
 * The size of each element in the mask must match the
 * size of each element in the result. For shuffle, only
 * the ilogb(2m-1) least significant bits of each mask
 * element are considered. For shuffle2, only the
 * ilogb(2m-1)+1 least significant bits of each mask
 * element are considered. Other bits in the mask shall
 * be ignored.
 * The elements of the input vectors are numbered from
 * left to right across one or both of the vectors. For this
 * purpose, the number of elements in a vector is given
 * by vec_step(gentypem). The shuffle mask operand
 * specifies, for each element of the result vector, which
 * element of the one or two input vectors the result
 * element gets.
 * Examples:
 * uint4 mask = (uint4)(3, 2, 1, 0);
 * float4 a;
 * float4 r = shuffle(a, mask);
 * // r.s0123 = a.wzyx
 *
 * uint8 mask = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
 * float4 a, b;
 * float8 r = shuffle2(a, b, mask);
 * // r.s0123 = a.xyzw
 * // r.s4567 = b.xyzw
 *
 * uint4 mask;
 * float8 a;
 * float4 b;
 * b = shuffle(a, mask);
 *
 * Examples that are not valid are:
 * uint8 mask;
 * short16 a;
 * short8 b;
 * b = shuffle(a, mask); // not valid: the 32-bit mask elements do not
 *                       // match the 16-bit result elements
*/
char2 __ovld __cnfn shuffle(char2 x, uchar2 mask);
char2 __ovld __cnfn shuffle(char4 x, uchar2 mask);
char2 __ovld __cnfn shuffle(char8 x, uchar2 mask);
char2 __ovld __cnfn shuffle(char16 x, uchar2 mask);
uchar2 __ovld __cnfn shuffle(uchar2 x, uchar2 mask);
uchar2 __ovld __cnfn shuffle(uchar4 x, uchar2 mask);
uchar2 __ovld __cnfn shuffle(uchar8 x, uchar2 mask);
uchar2 __ovld __cnfn shuffle(uchar16 x, uchar2 mask);
short2 __ovld __cnfn shuffle(short2 x, ushort2 mask);
short2 __ovld __cnfn shuffle(short4 x, ushort2 mask);
short2 __ovld __cnfn shuffle(short8 x, ushort2 mask);
short2 __ovld __cnfn shuffle(short16 x, ushort2 mask);
ushort2 __ovld __cnfn shuffle(ushort2 x, ushort2 mask);
ushort2 __ovld __cnfn shuffle(ushort4 x, ushort2 mask);
ushort2 __ovld __cnfn shuffle(ushort8 x, ushort2 mask);
ushort2 __ovld __cnfn shuffle(ushort16 x, ushort2 mask);
int2 __ovld __cnfn shuffle(int2 x, uint2 mask);
int2 __ovld __cnfn shuffle(int4 x, uint2 mask);
int2 __ovld __cnfn shuffle(int8 x, uint2 mask);
int2 __ovld __cnfn shuffle(int16 x, uint2 mask);
uint2 __ovld __cnfn shuffle(uint2 x, uint2 mask);
uint2 __ovld __cnfn shuffle(uint4 x, uint2 mask);
uint2 __ovld __cnfn shuffle(uint8 x, uint2 mask);
uint2 __ovld __cnfn shuffle(uint16 x, uint2 mask);
long2 __ovld __cnfn shuffle(long2 x, ulong2 mask);
long2 __ovld __cnfn shuffle(long4 x, ulong2 mask);
long2 __ovld __cnfn shuffle(long8 x, ulong2 mask);
long2 __ovld __cnfn shuffle(long16 x, ulong2 mask);
ulong2 __ovld __cnfn shuffle(ulong2 x, ulong2 mask);
ulong2 __ovld __cnfn shuffle(ulong4 x, ulong2 mask);
ulong2 __ovld __cnfn shuffle(ulong8 x, ulong2 mask);
ulong2 __ovld __cnfn shuffle(ulong16 x, ulong2 mask);
float2 __ovld __cnfn shuffle(float2 x, uint2 mask);
float2 __ovld __cnfn shuffle(float4 x, uint2 mask);
float2 __ovld __cnfn shuffle(float8 x, uint2 mask);
float2 __ovld __cnfn shuffle(float16 x, uint2 mask);
char4 __ovld __cnfn shuffle(char2 x, uchar4 mask);
char4 __ovld __cnfn shuffle(char4 x, uchar4 mask);
char4 __ovld __cnfn shuffle(char8 x, uchar4 mask);
char4 __ovld __cnfn shuffle(char16 x, uchar4 mask);
uchar4 __ovld __cnfn shuffle(uchar2 x, uchar4 mask);
uchar4 __ovld __cnfn shuffle(uchar4 x, uchar4 mask);
uchar4 __ovld __cnfn shuffle(uchar8 x, uchar4 mask);
uchar4 __ovld __cnfn shuffle(uchar16 x, uchar4 mask);
short4 __ovld __cnfn shuffle(short2 x, ushort4 mask);
short4 __ovld __cnfn shuffle(short4 x, ushort4 mask);
short4 __ovld __cnfn shuffle(short8 x, ushort4 mask);
short4 __ovld __cnfn shuffle(short16 x, ushort4 mask);
ushort4 __ovld __cnfn shuffle(ushort2 x, ushort4 mask);
ushort4 __ovld __cnfn shuffle(ushort4 x, ushort4 mask);
ushort4 __ovld __cnfn shuffle(ushort8 x, ushort4 mask);
ushort4 __ovld __cnfn shuffle(ushort16 x, ushort4 mask);
int4 __ovld __cnfn shuffle(int2 x, uint4 mask);
int4 __ovld __cnfn shuffle(int4 x, uint4 mask);
int4 __ovld __cnfn shuffle(int8 x, uint4 mask);
int4 __ovld __cnfn shuffle(int16 x, uint4 mask);
uint4 __ovld __cnfn shuffle(uint2 x, uint4 mask);
uint4 __ovld __cnfn shuffle(uint4 x, uint4 mask);
uint4 __ovld __cnfn shuffle(uint8 x, uint4 mask);
uint4 __ovld __cnfn shuffle(uint16 x, uint4 mask);
long4 __ovld __cnfn shuffle(long2 x, ulong4 mask);
long4 __ovld __cnfn shuffle(long4 x, ulong4 mask);
long4 __ovld __cnfn shuffle(long8 x, ulong4 mask);
long4 __ovld __cnfn shuffle(long16 x, ulong4 mask);
ulong4 __ovld __cnfn shuffle(ulong2 x, ulong4 mask);
ulong4 __ovld __cnfn shuffle(ulong4 x, ulong4 mask);
ulong4 __ovld __cnfn shuffle(ulong8 x, ulong4 mask);
ulong4 __ovld __cnfn shuffle(ulong16 x, ulong4 mask);
float4 __ovld __cnfn shuffle(float2 x, uint4 mask);
float4 __ovld __cnfn shuffle(float4 x, uint4 mask);
float4 __ovld __cnfn shuffle(float8 x, uint4 mask);
float4 __ovld __cnfn shuffle(float16 x, uint4 mask);
char8 __ovld __cnfn shuffle(char2 x, uchar8 mask);
char8 __ovld __cnfn shuffle(char4 x, uchar8 mask);
char8 __ovld __cnfn shuffle(char8 x, uchar8 mask);
char8 __ovld __cnfn shuffle(char16 x, uchar8 mask);
uchar8 __ovld __cnfn shuffle(uchar2 x, uchar8 mask);
uchar8 __ovld __cnfn shuffle(uchar4 x, uchar8 mask);
uchar8 __ovld __cnfn shuffle(uchar8 x, uchar8 mask);
uchar8 __ovld __cnfn shuffle(uchar16 x, uchar8 mask);
short8 __ovld __cnfn shuffle(short2 x, ushort8 mask);
short8 __ovld __cnfn shuffle(short4 x, ushort8 mask);
short8 __ovld __cnfn shuffle(short8 x, ushort8 mask);
short8 __ovld __cnfn shuffle(short16 x, ushort8 mask);
ushort8 __ovld __cnfn shuffle(ushort2 x, ushort8 mask);
ushort8 __ovld __cnfn shuffle(ushort4 x, ushort8 mask);
ushort8 __ovld __cnfn shuffle(ushort8 x, ushort8 mask);
ushort8 __ovld __cnfn shuffle(ushort16 x, ushort8 mask);
int8 __ovld __cnfn shuffle(int2 x, uint8 mask);
int8 __ovld __cnfn shuffle(int4 x, uint8 mask);
int8 __ovld __cnfn shuffle(int8 x, uint8 mask);
int8 __ovld __cnfn shuffle(int16 x, uint8 mask);
uint8 __ovld __cnfn shuffle(uint2 x, uint8 mask);
uint8 __ovld __cnfn shuffle(uint4 x, uint8 mask);
uint8 __ovld __cnfn shuffle(uint8 x, uint8 mask);
uint8 __ovld __cnfn shuffle(uint16 x, uint8 mask);
long8 __ovld __cnfn shuffle(long2 x, ulong8 mask);
long8 __ovld __cnfn shuffle(long4 x, ulong8 mask);
long8 __ovld __cnfn shuffle(long8 x, ulong8 mask);
long8 __ovld __cnfn shuffle(long16 x, ulong8 mask);
ulong8 __ovld __cnfn shuffle(ulong2 x, ulong8 mask);
ulong8 __ovld __cnfn shuffle(ulong4 x, ulong8 mask);
ulong8 __ovld __cnfn shuffle(ulong8 x, ulong8 mask);
ulong8 __ovld __cnfn shuffle(ulong16 x, ulong8 mask);
float8 __ovld __cnfn shuffle(float2 x, uint8 mask);
float8 __ovld __cnfn shuffle(float4 x, uint8 mask);
float8 __ovld __cnfn shuffle(float8 x, uint8 mask);
float8 __ovld __cnfn shuffle(float16 x, uint8 mask);
char16 __ovld __cnfn shuffle(char2 x, uchar16 mask);
char16 __ovld __cnfn shuffle(char4 x, uchar16 mask);
char16 __ovld __cnfn shuffle(char8 x, uchar16 mask);
char16 __ovld __cnfn shuffle(char16 x, uchar16 mask);
uchar16 __ovld __cnfn shuffle(uchar2 x, uchar16 mask);
uchar16 __ovld __cnfn shuffle(uchar4 x, uchar16 mask);
uchar16 __ovld __cnfn shuffle(uchar8 x, uchar16 mask);
uchar16 __ovld __cnfn shuffle(uchar16 x, uchar16 mask);
short16 __ovld __cnfn shuffle(short2 x, ushort16 mask);
short16 __ovld __cnfn shuffle(short4 x, ushort16 mask);
short16 __ovld __cnfn shuffle(short8 x, ushort16 mask);
short16 __ovld __cnfn shuffle(short16 x, ushort16 mask);
ushort16 __ovld __cnfn shuffle(ushort2 x, ushort16 mask);
ushort16 __ovld __cnfn shuffle(ushort4 x, ushort16 mask);
ushort16 __ovld __cnfn shuffle(ushort8 x, ushort16 mask);
ushort16 __ovld __cnfn shuffle(ushort16 x, ushort16 mask);
int16 __ovld __cnfn shuffle(int2 x, uint16 mask);
int16 __ovld __cnfn shuffle(int4 x, uint16 mask);
int16 __ovld __cnfn shuffle(int8 x, uint16 mask);
int16 __ovld __cnfn shuffle(int16 x, uint16 mask);
uint16 __ovld __cnfn shuffle(uint2 x, uint16 mask);
uint16 __ovld __cnfn shuffle(uint4 x, uint16 mask);
uint16 __ovld __cnfn shuffle(uint8 x, uint16 mask);
uint16 __ovld __cnfn shuffle(uint16 x, uint16 mask);
long16 __ovld __cnfn shuffle(long2 x, ulong16 mask);
long16 __ovld __cnfn shuffle(long4 x, ulong16 mask);
long16 __ovld __cnfn shuffle(long8 x, ulong16 mask);
long16 __ovld __cnfn shuffle(long16 x, ulong16 mask);
ulong16 __ovld __cnfn shuffle(ulong2 x, ulong16 mask);
ulong16 __ovld __cnfn shuffle(ulong4 x, ulong16 mask);
ulong16 __ovld __cnfn shuffle(ulong8 x, ulong16 mask);
ulong16 __ovld __cnfn shuffle(ulong16 x, ulong16 mask);
float16 __ovld __cnfn shuffle(float2 x, uint16 mask);
float16 __ovld __cnfn shuffle(float4 x, uint16 mask);
float16 __ovld __cnfn shuffle(float8 x, uint16 mask);
float16 __ovld __cnfn shuffle(float16 x, uint16 mask);
#ifdef cl_khr_fp64
double2 __ovld __cnfn shuffle(double2 x, ulong2 mask);
double2 __ovld __cnfn shuffle(double4 x, ulong2 mask);
double2 __ovld __cnfn shuffle(double8 x, ulong2 mask);
double2 __ovld __cnfn shuffle(double16 x, ulong2 mask);
double4 __ovld __cnfn shuffle(double2 x, ulong4 mask);
double4 __ovld __cnfn shuffle(double4 x, ulong4 mask);
double4 __ovld __cnfn shuffle(double8 x, ulong4 mask);
double4 __ovld __cnfn shuffle(double16 x, ulong4 mask);
double8 __ovld __cnfn shuffle(double2 x, ulong8 mask);
double8 __ovld __cnfn shuffle(double4 x, ulong8 mask);
double8 __ovld __cnfn shuffle(double8 x, ulong8 mask);
double8 __ovld __cnfn shuffle(double16 x, ulong8 mask);
double16 __ovld __cnfn shuffle(double2 x, ulong16 mask);
double16 __ovld __cnfn shuffle(double4 x, ulong16 mask);
double16 __ovld __cnfn shuffle(double8 x, ulong16 mask);
double16 __ovld __cnfn shuffle(double16 x, ulong16 mask);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half2 __ovld __cnfn shuffle(half2 x, ushort2 mask);
half2 __ovld __cnfn shuffle(half4 x, ushort2 mask);
half2 __ovld __cnfn shuffle(half8 x, ushort2 mask);
half2 __ovld __cnfn shuffle(half16 x, ushort2 mask);
half4 __ovld __cnfn shuffle(half2 x, ushort4 mask);
half4 __ovld __cnfn shuffle(half4 x, ushort4 mask);
half4 __ovld __cnfn shuffle(half8 x, ushort4 mask);
half4 __ovld __cnfn shuffle(half16 x, ushort4 mask);
half8 __ovld __cnfn shuffle(half2 x, ushort8 mask);
half8 __ovld __cnfn shuffle(half4 x, ushort8 mask);
half8 __ovld __cnfn shuffle(half8 x, ushort8 mask);
half8 __ovld __cnfn shuffle(half16 x, ushort8 mask);
half16 __ovld __cnfn shuffle(half2 x, ushort16 mask);
half16 __ovld __cnfn shuffle(half4 x, ushort16 mask);
half16 __ovld __cnfn shuffle(half8 x, ushort16 mask);
half16 __ovld __cnfn shuffle(half16 x, ushort16 mask);
#endif //cl_khr_fp16
char2 __ovld __cnfn shuffle2(char2 x, char2 y, uchar2 mask);
char2 __ovld __cnfn shuffle2(char4 x, char4 y, uchar2 mask);
char2 __ovld __cnfn shuffle2(char8 x, char8 y, uchar2 mask);
char2 __ovld __cnfn shuffle2(char16 x, char16 y, uchar2 mask);
uchar2 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar2 mask);
uchar2 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar2 mask);
uchar2 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar2 mask);
uchar2 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar2 mask);
short2 __ovld __cnfn shuffle2(short2 x, short2 y, ushort2 mask);
short2 __ovld __cnfn shuffle2(short4 x, short4 y, ushort2 mask);
short2 __ovld __cnfn shuffle2(short8 x, short8 y, ushort2 mask);
short2 __ovld __cnfn shuffle2(short16 x, short16 y, ushort2 mask);
ushort2 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort2 mask);
ushort2 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort2 mask);
ushort2 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort2 mask);
ushort2 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort2 mask);
int2 __ovld __cnfn shuffle2(int2 x, int2 y, uint2 mask);
int2 __ovld __cnfn shuffle2(int4 x, int4 y, uint2 mask);
int2 __ovld __cnfn shuffle2(int8 x, int8 y, uint2 mask);
int2 __ovld __cnfn shuffle2(int16 x, int16 y, uint2 mask);
uint2 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint2 mask);
uint2 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint2 mask);
uint2 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint2 mask);
uint2 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint2 mask);
long2 __ovld __cnfn shuffle2(long2 x, long2 y, ulong2 mask);
long2 __ovld __cnfn shuffle2(long4 x, long4 y, ulong2 mask);
long2 __ovld __cnfn shuffle2(long8 x, long8 y, ulong2 mask);
long2 __ovld __cnfn shuffle2(long16 x, long16 y, ulong2 mask);
ulong2 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong2 mask);
ulong2 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong2 mask);
ulong2 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong2 mask);
ulong2 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong2 mask);
float2 __ovld __cnfn shuffle2(float2 x, float2 y, uint2 mask);
float2 __ovld __cnfn shuffle2(float4 x, float4 y, uint2 mask);
float2 __ovld __cnfn shuffle2(float8 x, float8 y, uint2 mask);
float2 __ovld __cnfn shuffle2(float16 x, float16 y, uint2 mask);
char4 __ovld __cnfn shuffle2(char2 x, char2 y, uchar4 mask);
char4 __ovld __cnfn shuffle2(char4 x, char4 y, uchar4 mask);
char4 __ovld __cnfn shuffle2(char8 x, char8 y, uchar4 mask);
char4 __ovld __cnfn shuffle2(char16 x, char16 y, uchar4 mask);
uchar4 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar4 mask);
uchar4 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar4 mask);
uchar4 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar4 mask);
uchar4 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar4 mask);
short4 __ovld __cnfn shuffle2(short2 x, short2 y, ushort4 mask);
short4 __ovld __cnfn shuffle2(short4 x, short4 y, ushort4 mask);
short4 __ovld __cnfn shuffle2(short8 x, short8 y, ushort4 mask);
short4 __ovld __cnfn shuffle2(short16 x, short16 y, ushort4 mask);
ushort4 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort4 mask);
ushort4 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort4 mask);
ushort4 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort4 mask);
ushort4 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort4 mask);
int4 __ovld __cnfn shuffle2(int2 x, int2 y, uint4 mask);
int4 __ovld __cnfn shuffle2(int4 x, int4 y, uint4 mask);
int4 __ovld __cnfn shuffle2(int8 x, int8 y, uint4 mask);
int4 __ovld __cnfn shuffle2(int16 x, int16 y, uint4 mask);
uint4 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint4 mask);
uint4 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint4 mask);
uint4 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint4 mask);
uint4 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint4 mask);
long4 __ovld __cnfn shuffle2(long2 x, long2 y, ulong4 mask);
long4 __ovld __cnfn shuffle2(long4 x, long4 y, ulong4 mask);
long4 __ovld __cnfn shuffle2(long8 x, long8 y, ulong4 mask);
long4 __ovld __cnfn shuffle2(long16 x, long16 y, ulong4 mask);
ulong4 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong4 mask);
ulong4 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong4 mask);
ulong4 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong4 mask);
ulong4 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong4 mask);
float4 __ovld __cnfn shuffle2(float2 x, float2 y, uint4 mask);
float4 __ovld __cnfn shuffle2(float4 x, float4 y, uint4 mask);
float4 __ovld __cnfn shuffle2(float8 x, float8 y, uint4 mask);
float4 __ovld __cnfn shuffle2(float16 x, float16 y, uint4 mask);
char8 __ovld __cnfn shuffle2(char2 x, char2 y, uchar8 mask);
char8 __ovld __cnfn shuffle2(char4 x, char4 y, uchar8 mask);
char8 __ovld __cnfn shuffle2(char8 x, char8 y, uchar8 mask);
char8 __ovld __cnfn shuffle2(char16 x, char16 y, uchar8 mask);
uchar8 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar8 mask);
uchar8 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar8 mask);
uchar8 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar8 mask);
uchar8 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar8 mask);
short8 __ovld __cnfn shuffle2(short2 x, short2 y, ushort8 mask);
short8 __ovld __cnfn shuffle2(short4 x, short4 y, ushort8 mask);
short8 __ovld __cnfn shuffle2(short8 x, short8 y, ushort8 mask);
short8 __ovld __cnfn shuffle2(short16 x, short16 y, ushort8 mask);
ushort8 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort8 mask);
ushort8 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort8 mask);
ushort8 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort8 mask);
ushort8 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort8 mask);
int8 __ovld __cnfn shuffle2(int2 x, int2 y, uint8 mask);
int8 __ovld __cnfn shuffle2(int4 x, int4 y, uint8 mask);
int8 __ovld __cnfn shuffle2(int8 x, int8 y, uint8 mask);
int8 __ovld __cnfn shuffle2(int16 x, int16 y, uint8 mask);
uint8 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint8 mask);
uint8 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint8 mask);
uint8 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint8 mask);
uint8 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint8 mask);
long8 __ovld __cnfn shuffle2(long2 x, long2 y, ulong8 mask);
long8 __ovld __cnfn shuffle2(long4 x, long4 y, ulong8 mask);
long8 __ovld __cnfn shuffle2(long8 x, long8 y, ulong8 mask);
long8 __ovld __cnfn shuffle2(long16 x, long16 y, ulong8 mask);
ulong8 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong8 mask);
ulong8 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong8 mask);
ulong8 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong8 mask);
ulong8 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong8 mask);
float8 __ovld __cnfn shuffle2(float2 x, float2 y, uint8 mask);
float8 __ovld __cnfn shuffle2(float4 x, float4 y, uint8 mask);
float8 __ovld __cnfn shuffle2(float8 x, float8 y, uint8 mask);
float8 __ovld __cnfn shuffle2(float16 x, float16 y, uint8 mask);
char16 __ovld __cnfn shuffle2(char2 x, char2 y, uchar16 mask);
char16 __ovld __cnfn shuffle2(char4 x, char4 y, uchar16 mask);
char16 __ovld __cnfn shuffle2(char8 x, char8 y, uchar16 mask);
char16 __ovld __cnfn shuffle2(char16 x, char16 y, uchar16 mask);
uchar16 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar16 mask);
uchar16 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar16 mask);
uchar16 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar16 mask);
uchar16 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar16 mask);
short16 __ovld __cnfn shuffle2(short2 x, short2 y, ushort16 mask);
short16 __ovld __cnfn shuffle2(short4 x, short4 y, ushort16 mask);
short16 __ovld __cnfn shuffle2(short8 x, short8 y, ushort16 mask);
short16 __ovld __cnfn shuffle2(short16 x, short16 y, ushort16 mask);
ushort16 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort16 mask);
ushort16 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort16 mask);
ushort16 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort16 mask);
ushort16 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort16 mask);
int16 __ovld __cnfn shuffle2(int2 x, int2 y, uint16 mask);
int16 __ovld __cnfn shuffle2(int4 x, int4 y, uint16 mask);
int16 __ovld __cnfn shuffle2(int8 x, int8 y, uint16 mask);
int16 __ovld __cnfn shuffle2(int16 x, int16 y, uint16 mask);
uint16 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint16 mask);
uint16 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint16 mask);
uint16 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint16 mask);
uint16 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint16 mask);
long16 __ovld __cnfn shuffle2(long2 x, long2 y, ulong16 mask);
long16 __ovld __cnfn shuffle2(long4 x, long4 y, ulong16 mask);
long16 __ovld __cnfn shuffle2(long8 x, long8 y, ulong16 mask);
long16 __ovld __cnfn shuffle2(long16 x, long16 y, ulong16 mask);
ulong16 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong16 mask);
ulong16 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong16 mask);
ulong16 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong16 mask);
ulong16 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong16 mask);
float16 __ovld __cnfn shuffle2(float2 x, float2 y, uint16 mask);
float16 __ovld __cnfn shuffle2(float4 x, float4 y, uint16 mask);
float16 __ovld __cnfn shuffle2(float8 x, float8 y, uint16 mask);
float16 __ovld __cnfn shuffle2(float16 x, float16 y, uint16 mask);
#ifdef cl_khr_fp64
double2 __ovld __cnfn shuffle2(double2 x, double2 y, ulong2 mask);
double2 __ovld __cnfn shuffle2(double4 x, double4 y, ulong2 mask);
double2 __ovld __cnfn shuffle2(double8 x, double8 y, ulong2 mask);
double2 __ovld __cnfn shuffle2(double16 x, double16 y, ulong2 mask);
double4 __ovld __cnfn shuffle2(double2 x, double2 y, ulong4 mask);
double4 __ovld __cnfn shuffle2(double4 x, double4 y, ulong4 mask);
double4 __ovld __cnfn shuffle2(double8 x, double8 y, ulong4 mask);
double4 __ovld __cnfn shuffle2(double16 x, double16 y, ulong4 mask);
double8 __ovld __cnfn shuffle2(double2 x, double2 y, ulong8 mask);
double8 __ovld __cnfn shuffle2(double4 x, double4 y, ulong8 mask);
double8 __ovld __cnfn shuffle2(double8 x, double8 y, ulong8 mask);
double8 __ovld __cnfn shuffle2(double16 x, double16 y, ulong8 mask);
double16 __ovld __cnfn shuffle2(double2 x, double2 y, ulong16 mask);
double16 __ovld __cnfn shuffle2(double4 x, double4 y, ulong16 mask);
double16 __ovld __cnfn shuffle2(double8 x, double8 y, ulong16 mask);
double16 __ovld __cnfn shuffle2(double16 x, double16 y, ulong16 mask);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half2 __ovld __cnfn shuffle2(half2 x, half2 y, ushort2 mask);
half2 __ovld __cnfn shuffle2(half4 x, half4 y, ushort2 mask);
half2 __ovld __cnfn shuffle2(half8 x, half8 y, ushort2 mask);
half2 __ovld __cnfn shuffle2(half16 x, half16 y, ushort2 mask);
half4 __ovld __cnfn shuffle2(half2 x, half2 y, ushort4 mask);
half4 __ovld __cnfn shuffle2(half4 x, half4 y, ushort4 mask);
half4 __ovld __cnfn shuffle2(half8 x, half8 y, ushort4 mask);
half4 __ovld __cnfn shuffle2(half16 x, half16 y, ushort4 mask);
half8 __ovld __cnfn shuffle2(half2 x, half2 y, ushort8 mask);
half8 __ovld __cnfn shuffle2(half4 x, half4 y, ushort8 mask);
half8 __ovld __cnfn shuffle2(half8 x, half8 y, ushort8 mask);
half8 __ovld __cnfn shuffle2(half16 x, half16 y, ushort8 mask);
half16 __ovld __cnfn shuffle2(half2 x, half2 y, ushort16 mask);
half16 __ovld __cnfn shuffle2(half4 x, half4 y, ushort16 mask);
half16 __ovld __cnfn shuffle2(half8 x, half8 y, ushort16 mask);
half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask);
#endif //cl_khr_fp16
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
#endif
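/*
 * Illustrative usage sketch (not part of the header), assuming a
 * hypothetical kernel name "debug_ids". Vector arguments need the OpenCL
 * vector length modifier, e.g. %v4hlf for a float4.
 *
 *   __kernel void debug_ids(__global const float4 *in) {
 *     uint gid = (uint)get_global_id(0);
 *     printf("gid %u: %v4hlf\n", gid, in[gid]);
 *   }
 */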
// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
#ifdef cl_khr_gl_msaa_sharing
#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
#endif //cl_khr_gl_msaa_sharing
/**
* Use the coordinate (coord.xy) to do an element lookup in
* the 2D image object specified by image.
*
* Use the coordinate (coord.x, coord.y, coord.z) to do
* an element lookup in the 3D image object specified
* by image. coord.w is ignored.
*
 * Use the coordinate (coord.z) to index into the
 * 2D image array object specified by image_array
 * and (coord.x, coord.y) to do an element lookup in
 * that 2D image.
 *
 * Use the coordinate (coord) to do an element lookup in
 * the 1D image object specified by image.
 *
 * Use the coordinate (coord.y) to index into the
 * 1D image array object specified by image_array
 * and (coord.x) to do an element lookup in
 * that 1D image.
*
 * Use the coordinate (coord.xy) and sample to do an
 * element lookup in the 2D multi-sample image specified
 * by image.
*
* Use coord.xy and sample to do an element
* lookup in the 2D multi-sample image layer
* identified by index coord.z in the 2D multi-sample
* image array specified by image.
*
* For mipmap images, use the mip-level specified by
* the Level-of-Detail (lod) or use gradients for LOD
* computation.
*
 * read_imagef returns floating-point values in the
 * range [0.0 ... 1.0] for image objects created with
 * image_channel_data_type set to one of the predefined
 * packed formats, or to CL_UNORM_INT8 or
 * CL_UNORM_INT16.
 *
 * read_imagef returns floating-point values in the
 * range [-1.0 ... 1.0] for image objects created with
 * image_channel_data_type set to CL_SNORM_INT8
 * or CL_SNORM_INT16.
*
* read_imagef returns floating-point values for image
* objects created with image_channel_data_type set to
* CL_HALF_FLOAT or CL_FLOAT.
*
* read_imagei and read_imageui return
* unnormalized signed integer and unsigned integer
* values respectively. Each channel will be stored in a
* 32-bit integer.
*
* read_imagei can only be used with image objects
* created with image_channel_data_type set to one of
* the following values:
* CL_SIGNED_INT8,
* CL_SIGNED_INT16 and
* CL_SIGNED_INT32.
* If the image_channel_data_type is not one of the
* above values, the values returned by read_imagei
* are undefined.
*
* read_imageui can only be used with image objects
* created with image_channel_data_type set to one of
* the following values:
* CL_UNSIGNED_INT8,
* CL_UNSIGNED_INT16 and
* CL_UNSIGNED_INT32.
* If the image_channel_data_type is not one of the
* above values, the values returned by read_imageui
* are undefined.
*
* The read_image{i|ui} calls support a nearest filter
* only. The filter_mode specified in sampler
* must be set to CLK_FILTER_NEAREST; otherwise
* the values returned are undefined.
* The read_image{f|i|ui} calls that take
* integer coordinates must use a sampler with
* normalized coordinates set to
* CLK_NORMALIZED_COORDS_FALSE and
* addressing mode set to
* CLK_ADDRESS_CLAMP_TO_EDGE,
* CLK_ADDRESS_CLAMP or CLK_ADDRESS_NONE;
* otherwise the values returned are undefined.
*
* Values returned by read_imagef for image objects
* with image_channel_data_type values not specified
* in the description above are undefined.
*/
float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, int2 coord);
float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord);
int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, int2 coord);
int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord);
uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, int2 coord);
uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord);
float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, int4 coord);
float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord);
int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, int4 coord);
int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord);
uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, int4 coord);
uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord);
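/*
 * Illustrative usage sketch (not part of the header), assuming a
 * hypothetical kernel name "sample_copy": a sampler-based read with
 * unnormalized integer coordinates, nearest filtering and clamp-to-edge
 * addressing, as required by the rules above.
 *
 *   __constant sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
 *                              CLK_ADDRESS_CLAMP_TO_EDGE |
 *                              CLK_FILTER_NEAREST;
 *
 *   __kernel void sample_copy(read_only image2d_t src,
 *                             __global float4 *dst, int width) {
 *     int2 coord = (int2)(get_global_id(0), get_global_id(1));
 *     dst[coord.y * width + coord.x] = read_imagef(src, smp, coord);
 *   }
 */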
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, int coord);
float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord);
int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, int coord);
int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord);
uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, int coord);
uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
#ifdef cl_khr_depth_images
float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord);
float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, int2 coord);
float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord);
float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, int4 coord);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
float4 __purefn __ovld read_imagef(read_only image2d_msaa_t image, int2 coord, int sample);
int4 __purefn __ovld read_imagei(read_only image2d_msaa_t image, int2 coord, int sample);
uint4 __purefn __ovld read_imageui(read_only image2d_msaa_t image, int2 coord, int sample);
float __purefn __ovld read_imagef(read_only image2d_msaa_depth_t image, int2 coord, int sample);
float4 __purefn __ovld read_imagef(read_only image2d_array_msaa_t image, int4 coord, int sample);
int4 __purefn __ovld read_imagei(read_only image2d_array_msaa_t image, int4 coord, int sample);
uint4 __purefn __ovld read_imageui(read_only image2d_array_msaa_t image, int4 coord, int sample);
float __purefn __ovld read_imagef(read_only image2d_array_msaa_depth_t image, int4 coord, int sample);
#endif //cl_khr_gl_msaa_sharing
// OpenCL Extension v2.0 s9.18 - Mipmaps
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#ifdef cl_khr_mipmap_image
float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod);
float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
#endif //cl_khr_mipmap_image
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
/**
* Sampler-less Image Access
*/
float4 __purefn __ovld read_imagef(read_only image1d_t image, int coord);
int4 __purefn __ovld read_imagei(read_only image1d_t image, int coord);
uint4 __purefn __ovld read_imageui(read_only image1d_t image, int coord);
float4 __purefn __ovld read_imagef(read_only image1d_buffer_t image, int coord);
int4 __purefn __ovld read_imagei(read_only image1d_buffer_t image, int coord);
uint4 __purefn __ovld read_imageui(read_only image1d_buffer_t image, int coord);
float4 __purefn __ovld read_imagef(read_only image1d_array_t image, int2 coord);
int4 __purefn __ovld read_imagei(read_only image1d_array_t image, int2 coord);
uint4 __purefn __ovld read_imageui(read_only image1d_array_t image, int2 coord);
float4 __purefn __ovld read_imagef(read_only image2d_t image, int2 coord);
int4 __purefn __ovld read_imagei(read_only image2d_t image, int2 coord);
uint4 __purefn __ovld read_imageui(read_only image2d_t image, int2 coord);
float4 __purefn __ovld read_imagef(read_only image2d_array_t image, int4 coord);
int4 __purefn __ovld read_imagei(read_only image2d_array_t image, int4 coord);
uint4 __purefn __ovld read_imageui(read_only image2d_array_t image, int4 coord);
#ifdef cl_khr_depth_images
float __purefn __ovld read_imagef(read_only image2d_depth_t image, int2 coord);
float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, int4 coord);
#endif //cl_khr_depth_images
float4 __purefn __ovld read_imagef(read_only image3d_t image, int4 coord);
int4 __purefn __ovld read_imagei(read_only image3d_t image, int4 coord);
uint4 __purefn __ovld read_imageui(read_only image3d_t image, int4 coord);
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
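/*
 * Illustrative sketch (not part of the header), assuming a hypothetical
 * kernel name "plain_copy": sampler-less reads (OpenCL 1.2+) take the
 * texel's integer coordinate directly, with no filtering or addressing
 * mode applied.
 *
 *   __kernel void plain_copy(read_only image2d_t src,
 *                            __global float4 *dst, int width) {
 *     int2 coord = (int2)(get_global_id(0), get_global_id(1));
 *     dst[coord.y * width + coord.x] = read_imagef(src, coord);
 *   }
 */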
// Image read functions returning half4 type
#ifdef cl_khr_fp16
half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, int coord);
half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, float coord);
half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, int2 coord);
half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, float2 coord);
half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, int4 coord);
half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, float4 coord);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, int2 coord);
half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, float2 coord);
half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, int4 coord);
half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, float4 coord);
/**
* Sampler-less Image Access
*/
half4 __purefn __ovld read_imageh(read_only image1d_t image, int coord);
half4 __purefn __ovld read_imageh(read_only image2d_t image, int2 coord);
half4 __purefn __ovld read_imageh(read_only image3d_t image, int4 coord);
half4 __purefn __ovld read_imageh(read_only image1d_array_t image, int2 coord);
half4 __purefn __ovld read_imageh(read_only image2d_array_t image, int4 coord);
half4 __purefn __ovld read_imageh(read_only image1d_buffer_t image, int coord);
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
#endif //cl_khr_fp16
// Image read functions for read_write images
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
float4 __purefn __ovld read_imagef(read_write image1d_t image, int coord);
int4 __purefn __ovld read_imagei(read_write image1d_t image, int coord);
uint4 __purefn __ovld read_imageui(read_write image1d_t image, int coord);
float4 __purefn __ovld read_imagef(read_write image1d_buffer_t image, int coord);
int4 __purefn __ovld read_imagei(read_write image1d_buffer_t image, int coord);
uint4 __purefn __ovld read_imageui(read_write image1d_buffer_t image, int coord);
float4 __purefn __ovld read_imagef(read_write image1d_array_t image, int2 coord);
int4 __purefn __ovld read_imagei(read_write image1d_array_t image, int2 coord);
uint4 __purefn __ovld read_imageui(read_write image1d_array_t image, int2 coord);
float4 __purefn __ovld read_imagef(read_write image2d_t image, int2 coord);
int4 __purefn __ovld read_imagei(read_write image2d_t image, int2 coord);
uint4 __purefn __ovld read_imageui(read_write image2d_t image, int2 coord);
float4 __purefn __ovld read_imagef(read_write image2d_array_t image, int4 coord);
int4 __purefn __ovld read_imagei(read_write image2d_array_t image, int4 coord);
uint4 __purefn __ovld read_imageui(read_write image2d_array_t image, int4 coord);
float4 __purefn __ovld read_imagef(read_write image3d_t image, int4 coord);
int4 __purefn __ovld read_imagei(read_write image3d_t image, int4 coord);
uint4 __purefn __ovld read_imageui(read_write image3d_t image, int4 coord);
#ifdef cl_khr_depth_images
float __purefn __ovld read_imagef(read_write image2d_depth_t image, int2 coord);
float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, int4 coord);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
float4 __purefn __ovld read_imagef(read_write image2d_msaa_t image, int2 coord, int sample);
int4 __purefn __ovld read_imagei(read_write image2d_msaa_t image, int2 coord, int sample);
uint4 __purefn __ovld read_imageui(read_write image2d_msaa_t image, int2 coord, int sample);
float4 __purefn __ovld read_imagef(read_write image2d_array_msaa_t image, int4 coord, int sample);
int4 __purefn __ovld read_imagei(read_write image2d_array_msaa_t image, int4 coord, int sample);
uint4 __purefn __ovld read_imageui(read_write image2d_array_msaa_t image, int4 coord, int sample);
float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 coord, int sample);
float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample);
#endif //cl_khr_gl_msaa_sharing
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#ifdef cl_khr_mipmap_image
float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod);
float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
#endif //cl_khr_mipmap_image
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// Image read functions returning half4 type
#ifdef cl_khr_fp16
half4 __purefn __ovld read_imageh(read_write image1d_t image, int coord);
half4 __purefn __ovld read_imageh(read_write image2d_t image, int2 coord);
half4 __purefn __ovld read_imageh(read_write image3d_t image, int4 coord);
half4 __purefn __ovld read_imageh(read_write image1d_array_t image, int2 coord);
half4 __purefn __ovld read_imageh(read_write image2d_array_t image, int4 coord);
half4 __purefn __ovld read_imageh(read_write image1d_buffer_t image, int coord);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
/**
* Write color value to location specified by coordinate
* (coord.x, coord.y) in the 2D image object specified by image.
* (coord.x, coord.y) are considered to be unnormalized coordinates
* and must be in the range 0 ... image width - 1, and 0
* ... image height - 1.
* Write color value to location specified by coordinate
* (coord.x, coord.y) in the 2D image object specified by index
* (coord.z) of the 2D image array object image_array.
 * (coord.x, coord.y) are considered to be unnormalized
 * coordinates and must be in the range 0 ... image width
 * - 1 and 0 ... image height - 1.
*
* Write color value to location specified by coordinate
* (coord) in the 1D image (buffer) object specified by image.
* coord is considered to be unnormalized coordinates
* and must be in the range 0 ... image width - 1.
*
* Write color value to location specified by coordinate
* (coord.x) in the 1D image object specified by index
* (coord.y) of the 1D image array object image_array.
 * coord.x is considered to be an unnormalized coordinate
 * and must be in the range 0 ... image width - 1.
*
* Write color value to location specified by coordinate
* (coord.x, coord.y, coord.z) in the 3D image object specified by image.
* coord.x & coord.y are considered to be unnormalized coordinates
* and must be in the range 0 ... image width - 1, and 0
* ... image height - 1.
*
 * For mipmap images, use the mip-level specified by lod.
*
* Appropriate data format conversion to the specified
* image format is done before writing the color value.
*
* write_imagef can only be used with image objects
* created with image_channel_data_type set to one of
* the pre-defined packed formats or set to
* CL_SNORM_INT8, CL_UNORM_INT8,
* CL_SNORM_INT16, CL_UNORM_INT16,
* CL_HALF_FLOAT or CL_FLOAT. Appropriate data
* format conversion will be done to convert channel
 * data from a floating-point value to the actual data
 * format in which the channels are stored.
*
* write_imagei can only be used with image objects
* created with image_channel_data_type set to one of
* the following values:
* CL_SIGNED_INT8,
* CL_SIGNED_INT16 and
* CL_SIGNED_INT32.
*
* write_imageui can only be used with image objects
* created with image_channel_data_type set to one of
* the following values:
* CL_UNSIGNED_INT8,
* CL_UNSIGNED_INT16 and
* CL_UNSIGNED_INT32.
*
 * The behavior of write_imagef, write_imagei and
 * write_imageui for image objects created with
 * image_channel_data_type values not specified in
 * the description above, or with coordinate values
 * outside the range (0 ... image width - 1,
 * 0 ... image height - 1), is undefined.
*/
void __ovld write_imagef(write_only image2d_t image, int2 coord, float4 color);
void __ovld write_imagei(write_only image2d_t image, int2 coord, int4 color);
void __ovld write_imageui(write_only image2d_t image, int2 coord, uint4 color);
void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, float4 color);
void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int4 color);
void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, uint4 color);
void __ovld write_imagef(write_only image1d_t image, int coord, float4 color);
void __ovld write_imagei(write_only image1d_t image, int coord, int4 color);
void __ovld write_imageui(write_only image1d_t image, int coord, uint4 color);
void __ovld write_imagef(write_only image1d_buffer_t image, int coord, float4 color);
void __ovld write_imagei(write_only image1d_buffer_t image, int coord, int4 color);
void __ovld write_imageui(write_only image1d_buffer_t image, int coord, uint4 color);
void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, float4 color);
void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color);
void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color);
#ifdef cl_khr_3d_image_writes
void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color);
void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color);
void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color);
#endif
#ifdef cl_khr_depth_images
void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, float color);
void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, float color);
#endif //cl_khr_depth_images
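/*
 * Illustrative sketch (not part of the header), assuming a hypothetical
 * kernel name "fill_rgba": write_imagef takes a float4 color that is
 * converted to the image's channel data type (e.g. CL_UNORM_INT8);
 * write_imagei/write_imageui would be required for the CL_SIGNED_INT* /
 * CL_UNSIGNED_INT* formats.
 *
 *   __kernel void fill_rgba(write_only image2d_t dst, float4 color) {
 *     int2 coord = (int2)(get_global_id(0), get_global_id(1));
 *     write_imagef(dst, coord, color);
 *   }
 */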
// OpenCL Extension v2.0 s9.18 - Mipmaps
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#ifdef cl_khr_mipmap_image
void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color);
void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color);
void __ovld write_imageui(write_only image1d_t image, int coord, int lod, uint4 color);
void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, int lod, float4 color);
void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int lod, int4 color);
void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, int lod, uint4 color);
void __ovld write_imagef(write_only image2d_t image, int2 coord, int lod, float4 color);
void __ovld write_imagei(write_only image2d_t image, int2 coord, int lod, int4 color);
void __ovld write_imageui(write_only image2d_t image, int2 coord, int lod, uint4 color);
void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, int lod, float4 color);
void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int lod, int4 color);
void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, int lod, uint4 color);
void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
#ifdef cl_khr_3d_image_writes
void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
#endif
#endif //cl_khr_mipmap_image
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// Image write functions for half4 type
#ifdef cl_khr_fp16
void __ovld write_imageh(write_only image1d_t image, int coord, half4 color);
void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color);
#ifdef cl_khr_3d_image_writes
void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color);
#endif
void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color);
void __ovld write_imageh(write_only image2d_array_t image, int4 coord, half4 color);
void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 color);
#endif //cl_khr_fp16
// Image write functions for read_write images
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld write_imagef(read_write image2d_t image, int2 coord, float4 color);
void __ovld write_imagei(read_write image2d_t image, int2 coord, int4 color);
void __ovld write_imageui(read_write image2d_t image, int2 coord, uint4 color);
void __ovld write_imagef(read_write image2d_array_t image_array, int4 coord, float4 color);
void __ovld write_imagei(read_write image2d_array_t image_array, int4 coord, int4 color);
void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, uint4 color);
void __ovld write_imagef(read_write image1d_t image, int coord, float4 color);
void __ovld write_imagei(read_write image1d_t image, int coord, int4 color);
void __ovld write_imageui(read_write image1d_t image, int coord, uint4 color);
void __ovld write_imagef(read_write image1d_buffer_t image, int coord, float4 color);
void __ovld write_imagei(read_write image1d_buffer_t image, int coord, int4 color);
void __ovld write_imageui(read_write image1d_buffer_t image, int coord, uint4 color);
void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, float4 color);
void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color);
void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color);
#ifdef cl_khr_3d_image_writes
void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color);
void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color);
void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color);
#endif
#ifdef cl_khr_depth_images
void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float color);
void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, float color);
#endif //cl_khr_depth_images
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#ifdef cl_khr_mipmap_image
void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color);
void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color);
void __ovld write_imageui(read_write image1d_t image, int coord, int lod, uint4 color);
void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, int lod, float4 color);
void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int lod, int4 color);
void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, int lod, uint4 color);
void __ovld write_imagef(read_write image2d_t image, int2 coord, int lod, float4 color);
void __ovld write_imagei(read_write image2d_t image, int2 coord, int lod, int4 color);
void __ovld write_imageui(read_write image2d_t image, int2 coord, int lod, uint4 color);
void __ovld write_imagef(read_write image2d_array_t image_array, int4 coord, int lod, float4 color);
void __ovld write_imagei(read_write image2d_array_t image_array, int4 coord, int lod, int4 color);
void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, int lod, uint4 color);
void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color);
void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color);
#ifdef cl_khr_3d_image_writes
void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
#endif
#endif //cl_khr_mipmap_image
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// Image write functions for half4 type
#ifdef cl_khr_fp16
void __ovld write_imageh(read_write image1d_t image, int coord, half4 color);
void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color);
#ifdef cl_khr_3d_image_writes
void __ovld write_imageh(read_write image3d_t image, int4 coord, half4 color);
#endif
void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 color);
void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color);
void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color);
#endif //cl_khr_fp16
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// Note: In OpenCL v1.0/1.1/1.2, the image argument of the image query builtin functions does not
// have an access qualifier, so it defaults to read_only. Image query builtin functions taking a
// write_only image argument must therefore also be declared.
/**
 * Return the image width in pixels.
 */
int __ovld __cnfn get_image_width(read_only image1d_t image);
int __ovld __cnfn get_image_width(read_only image1d_buffer_t image);
int __ovld __cnfn get_image_width(read_only image2d_t image);
int __ovld __cnfn get_image_width(read_only image3d_t image);
int __ovld __cnfn get_image_width(read_only image1d_array_t image);
int __ovld __cnfn get_image_width(read_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_width(read_only image2d_depth_t image);
int __ovld __cnfn get_image_width(read_only image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_width(read_only image2d_msaa_t image);
int __ovld __cnfn get_image_width(read_only image2d_msaa_depth_t image);
int __ovld __cnfn get_image_width(read_only image2d_array_msaa_t image);
int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
int __ovld __cnfn get_image_width(write_only image1d_t image);
int __ovld __cnfn get_image_width(write_only image1d_buffer_t image);
int __ovld __cnfn get_image_width(write_only image2d_t image);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(write_only image3d_t image);
#endif
int __ovld __cnfn get_image_width(write_only image1d_array_t image);
int __ovld __cnfn get_image_width(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_width(write_only image2d_depth_t image);
int __ovld __cnfn get_image_width(write_only image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_width(write_only image2d_msaa_t image);
int __ovld __cnfn get_image_width(write_only image2d_msaa_depth_t image);
int __ovld __cnfn get_image_width(write_only image2d_array_msaa_t image);
int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __cnfn get_image_width(read_write image1d_t image);
int __ovld __cnfn get_image_width(read_write image1d_buffer_t image);
int __ovld __cnfn get_image_width(read_write image2d_t image);
int __ovld __cnfn get_image_width(read_write image3d_t image);
int __ovld __cnfn get_image_width(read_write image1d_array_t image);
int __ovld __cnfn get_image_width(read_write image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_width(read_write image2d_depth_t image);
int __ovld __cnfn get_image_width(read_write image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_width(read_write image2d_msaa_t image);
int __ovld __cnfn get_image_width(read_write image2d_msaa_depth_t image);
int __ovld __cnfn get_image_width(read_write image2d_array_msaa_t image);
int __ovld __cnfn get_image_width(read_write image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
/**
* Return the image height in pixels.
*/
int __ovld __cnfn get_image_height(read_only image2d_t image);
int __ovld __cnfn get_image_height(read_only image3d_t image);
int __ovld __cnfn get_image_height(read_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_height(read_only image2d_depth_t image);
int __ovld __cnfn get_image_height(read_only image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_height(read_only image2d_msaa_t image);
int __ovld __cnfn get_image_height(read_only image2d_msaa_depth_t image);
int __ovld __cnfn get_image_height(read_only image2d_array_msaa_t image);
int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
int __ovld __cnfn get_image_height(write_only image2d_t image);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(write_only image3d_t image);
#endif
int __ovld __cnfn get_image_height(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_height(write_only image2d_depth_t image);
int __ovld __cnfn get_image_height(write_only image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_height(write_only image2d_msaa_t image);
int __ovld __cnfn get_image_height(write_only image2d_msaa_depth_t image);
int __ovld __cnfn get_image_height(write_only image2d_array_msaa_t image);
int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __cnfn get_image_height(read_write image2d_t image);
int __ovld __cnfn get_image_height(read_write image3d_t image);
int __ovld __cnfn get_image_height(read_write image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_height(read_write image2d_depth_t image);
int __ovld __cnfn get_image_height(read_write image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_height(read_write image2d_msaa_t image);
int __ovld __cnfn get_image_height(read_write image2d_msaa_depth_t image);
int __ovld __cnfn get_image_height(read_write image2d_array_msaa_t image);
int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
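/*
 * Illustrative sketch (not part of the header), assuming a hypothetical
 * kernel name "clear_checked": the width/height queries can guard against
 * a global work size that is rounded up beyond the image dimensions.
 *
 *   __kernel void clear_checked(write_only image2d_t dst) {
 *     int2 coord = (int2)(get_global_id(0), get_global_id(1));
 *     if (coord.x < get_image_width(dst) && coord.y < get_image_height(dst))
 *       write_imagef(dst, coord, (float4)(0.0f));
 *   }
 */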
/**
* Return the image depth in pixels.
*/
int __ovld __cnfn get_image_depth(read_only image3d_t image);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_depth(write_only image3d_t image);
#endif
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __cnfn get_image_depth(read_write image3d_t image);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// OpenCL Extension v2.0 s9.18 - Mipmaps
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#ifdef cl_khr_mipmap_image
/**
 * Return the number of mip-levels of the image.
 */
int __ovld get_image_num_mip_levels(read_only image1d_t image);
int __ovld get_image_num_mip_levels(read_only image2d_t image);
int __ovld get_image_num_mip_levels(read_only image3d_t image);
int __ovld get_image_num_mip_levels(write_only image1d_t image);
int __ovld get_image_num_mip_levels(write_only image2d_t image);
#ifdef cl_khr_3d_image_writes
int __ovld get_image_num_mip_levels(write_only image3d_t image);
#endif
int __ovld get_image_num_mip_levels(read_write image1d_t image);
int __ovld get_image_num_mip_levels(read_write image2d_t image);
int __ovld get_image_num_mip_levels(read_write image3d_t image);
int __ovld get_image_num_mip_levels(read_only image1d_array_t image);
int __ovld get_image_num_mip_levels(read_only image2d_array_t image);
int __ovld get_image_num_mip_levels(read_only image2d_array_depth_t image);
int __ovld get_image_num_mip_levels(read_only image2d_depth_t image);
int __ovld get_image_num_mip_levels(write_only image1d_array_t image);
int __ovld get_image_num_mip_levels(write_only image2d_array_t image);
int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t image);
int __ovld get_image_num_mip_levels(write_only image2d_depth_t image);
int __ovld get_image_num_mip_levels(read_write image1d_array_t image);
int __ovld get_image_num_mip_levels(read_write image2d_array_t image);
int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t image);
int __ovld get_image_num_mip_levels(read_write image2d_depth_t image);
#endif //cl_khr_mipmap_image
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
/**
* Return the channel data type. Valid values are:
* CLK_SNORM_INT8
* CLK_SNORM_INT16
* CLK_UNORM_INT8
* CLK_UNORM_INT16
* CLK_UNORM_SHORT_565
* CLK_UNORM_SHORT_555
* CLK_UNORM_SHORT_101010
* CLK_SIGNED_INT8
* CLK_SIGNED_INT16
* CLK_SIGNED_INT32
* CLK_UNSIGNED_INT8
* CLK_UNSIGNED_INT16
* CLK_UNSIGNED_INT32
* CLK_HALF_FLOAT
* CLK_FLOAT
*/
int __ovld __cnfn get_image_channel_data_type(read_only image1d_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image1d_buffer_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image2d_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image3d_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image1d_array_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_channel_data_type(read_only image2d_depth_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_channel_data_type(read_only image2d_msaa_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image2d_msaa_depth_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_t image);
int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image);
#endif
int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_channel_data_type(write_only image2d_depth_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_channel_data_type(write_only image2d_msaa_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image2d_msaa_depth_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_t image);
int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __cnfn get_image_channel_data_type(read_write image1d_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image3d_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image1d_array_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_channel_data_type(read_write image2d_depth_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_depth_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_t image);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
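/*
 * Illustrative sketch (not part of the header), assuming a hypothetical
 * helper "is_normalized_format": the returned CLK_* value can be used to
 * decide which read/write overload matches the image at run time.
 *
 *   bool is_normalized_format(read_only image2d_t img) {
 *     int dt = get_image_channel_data_type(img);
 *     return dt == CLK_UNORM_INT8 || dt == CLK_UNORM_INT16 ||
 *            dt == CLK_SNORM_INT8 || dt == CLK_SNORM_INT16;
 *   }
 */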
/**
* Return the image channel order. Valid values are:
* CLK_A
* CLK_R
* CLK_Rx
* CLK_RG
* CLK_RGx
* CLK_RA
* CLK_RGB
* CLK_RGBx
* CLK_RGBA
* CLK_ARGB
* CLK_BGRA
* CLK_INTENSITY
* CLK_LUMINANCE
*/
int __ovld __cnfn get_image_channel_order(read_only image1d_t image);
int __ovld __cnfn get_image_channel_order(read_only image1d_buffer_t image);
int __ovld __cnfn get_image_channel_order(read_only image2d_t image);
int __ovld __cnfn get_image_channel_order(read_only image3d_t image);
int __ovld __cnfn get_image_channel_order(read_only image1d_array_t image);
int __ovld __cnfn get_image_channel_order(read_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_channel_order(read_only image2d_depth_t image);
int __ovld __cnfn get_image_channel_order(read_only image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_channel_order(read_only image2d_msaa_t image);
int __ovld __cnfn get_image_channel_order(read_only image2d_msaa_depth_t image);
int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_t image);
int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
int __ovld __cnfn get_image_channel_order(write_only image1d_t image);
int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image);
int __ovld __cnfn get_image_channel_order(write_only image2d_t image);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(write_only image3d_t image);
#endif
int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image);
int __ovld __cnfn get_image_channel_order(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_channel_order(write_only image2d_depth_t image);
int __ovld __cnfn get_image_channel_order(write_only image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_channel_order(write_only image2d_msaa_t image);
int __ovld __cnfn get_image_channel_order(write_only image2d_msaa_depth_t image);
int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_t image);
int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __cnfn get_image_channel_order(read_write image1d_t image);
int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t image);
int __ovld __cnfn get_image_channel_order(read_write image2d_t image);
int __ovld __cnfn get_image_channel_order(read_write image3d_t image);
int __ovld __cnfn get_image_channel_order(read_write image1d_array_t image);
int __ovld __cnfn get_image_channel_order(read_write image2d_array_t image);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_channel_order(read_write image2d_depth_t image);
int __ovld __cnfn get_image_channel_order(read_write image2d_array_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_t image);
int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_depth_t image);
int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_t image);
int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
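/*
 * Illustrative sketch (not part of the header), assuming a hypothetical
 * helper "has_alpha": the channel-order query follows the same pattern as
 * the channel-data-type query above.
 *
 *   bool has_alpha(read_only image2d_t img) {
 *     int order = get_image_channel_order(img);
 *     return order == CLK_A || order == CLK_RA || order == CLK_RGBA ||
 *            order == CLK_ARGB || order == CLK_BGRA;
 *   }
 */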
/**
* Return the 2D image width and height as an int2
* type. The width is returned in the x component, and
* the height in the y component.
*/
int2 __ovld __cnfn get_image_dim(read_only image2d_t image);
int2 __ovld __cnfn get_image_dim(read_only image2d_array_t image);
#ifdef cl_khr_depth_images
int2 __ovld __cnfn get_image_dim(read_only image2d_array_depth_t image);
int2 __ovld __cnfn get_image_dim(read_only image2d_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int2 __ovld __cnfn get_image_dim(read_only image2d_msaa_t image);
int2 __ovld __cnfn get_image_dim(read_only image2d_msaa_depth_t image);
int2 __ovld __cnfn get_image_dim(read_only image2d_array_msaa_t image);
int2 __ovld __cnfn get_image_dim(read_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
int2 __ovld __cnfn get_image_dim(write_only image2d_t image);
int2 __ovld __cnfn get_image_dim(write_only image2d_array_t image);
#ifdef cl_khr_depth_images
int2 __ovld __cnfn get_image_dim(write_only image2d_array_depth_t image);
int2 __ovld __cnfn get_image_dim(write_only image2d_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int2 __ovld __cnfn get_image_dim(write_only image2d_msaa_t image);
int2 __ovld __cnfn get_image_dim(write_only image2d_msaa_depth_t image);
int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_t image);
int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int2 __ovld __cnfn get_image_dim(read_write image2d_t image);
int2 __ovld __cnfn get_image_dim(read_write image2d_array_t image);
#ifdef cl_khr_depth_images
int2 __ovld __cnfn get_image_dim(read_write image2d_array_depth_t image);
int2 __ovld __cnfn get_image_dim(read_write image2d_depth_t image);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_t image);
int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_depth_t image);
int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_t image);
int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image);
#endif //cl_khr_gl_msaa_sharing
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
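/*
 * Illustrative sketch (not part of the header), assuming a hypothetical
 * kernel name "invert" and OpenCL 1.2 sampler-less reads: get_image_dim
 * is a compact alternative to separate width/height queries.
 *
 *   __kernel void invert(read_only image2d_t src, write_only image2d_t dst) {
 *     int2 coord = (int2)(get_global_id(0), get_global_id(1));
 *     int2 dim = get_image_dim(src);
 *     if (coord.x < dim.x && coord.y < dim.y)
 *       write_imagef(dst, coord, 1.0f - read_imagef(src, coord));
 *   }
 */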
/**
* Return the 3D image width, height, and depth as an
* int4 type. The width is returned in the x
* component, height in the y component, depth in the z
* component and the w component is 0.
*/
int4 __ovld __cnfn get_image_dim(read_only image3d_t image);
#ifdef cl_khr_3d_image_writes
int4 __ovld __cnfn get_image_dim(write_only image3d_t image);
#endif
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int4 __ovld __cnfn get_image_dim(read_write image3d_t image);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
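// A minimal illustrative sketch of the 3D variant; names are hypothetical and
// the code is kept under #if 0 so it is never compiled.
#if 0
__kernel void example_query_dims_3d(read_only image3d_t img,
                                    __global int4 *out) {
  // .x = width, .y = height, .z = depth, .w = 0.
  out[0] = get_image_dim(img);
}
#endif // illustrative sketch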
/**
* Return the image array size.
*/
size_t __ovld __cnfn get_image_array_size(read_only image1d_array_t image_array);
size_t __ovld __cnfn get_image_array_size(read_only image2d_array_t image_array);
#ifdef cl_khr_depth_images
size_t __ovld __cnfn get_image_array_size(read_only image2d_array_depth_t image_array);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
size_t __ovld __cnfn get_image_array_size(read_only image2d_array_msaa_t image_array);
size_t __ovld __cnfn get_image_array_size(read_only image2d_array_msaa_depth_t image_array);
#endif //cl_khr_gl_msaa_sharing
size_t __ovld __cnfn get_image_array_size(write_only image1d_array_t image_array);
size_t __ovld __cnfn get_image_array_size(write_only image2d_array_t image_array);
#ifdef cl_khr_depth_images
size_t __ovld __cnfn get_image_array_size(write_only image2d_array_depth_t image_array);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_t image_array);
size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_depth_t image_array);
#endif //cl_khr_gl_msaa_sharing
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
size_t __ovld __cnfn get_image_array_size(read_write image1d_array_t image_array);
size_t __ovld __cnfn get_image_array_size(read_write image2d_array_t image_array);
#ifdef cl_khr_depth_images
size_t __ovld __cnfn get_image_array_size(read_write image2d_array_depth_t image_array);
#endif //cl_khr_depth_images
#if defined(cl_khr_gl_msaa_sharing)
size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_t image_array);
size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t image_array);
#endif //cl_khr_gl_msaa_sharing
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
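// A minimal illustrative sketch querying the number of slices in an image
// array; names are hypothetical and the code is never compiled.
#if 0
__kernel void example_query_array_size(read_only image2d_array_t arr,
                                       __global uint *out) {
  // get_image_array_size returns size_t; narrowed to uint here only to keep
  // the host-side buffer layout simple.
  out[0] = (uint)get_image_array_size(arr);
}
#endif // illustrative sketch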
/**
* Return the number of samples associated with the image.
*/
#if defined(cl_khr_gl_msaa_sharing)
int __ovld get_image_num_samples(read_only image2d_msaa_t image);
int __ovld get_image_num_samples(read_only image2d_msaa_depth_t image);
int __ovld get_image_num_samples(read_only image2d_array_msaa_t image);
int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image);
int __ovld get_image_num_samples(write_only image2d_msaa_t image);
int __ovld get_image_num_samples(write_only image2d_msaa_depth_t image);
int __ovld get_image_num_samples(write_only image2d_array_msaa_t image);
int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld get_image_num_samples(read_write image2d_msaa_t image);
int __ovld get_image_num_samples(read_write image2d_msaa_depth_t image);
int __ovld get_image_num_samples(read_write image2d_array_msaa_t image);
int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#endif
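// A minimal illustrative sketch querying the sample count of an MSAA image,
// assuming cl_khr_gl_msaa_sharing is available; names are hypothetical and the
// code is never compiled.
#if 0
__kernel void example_query_samples(read_only image2d_msaa_t img,
                                    __global int *out) {
  out[0] = get_image_num_samples(img);
}
#endif // illustrative sketch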
// OpenCL v2.0 s6.13.15 - Work-group Functions
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __conv work_group_all(int predicate);
int __ovld __conv work_group_any(int predicate);
#ifdef cl_khr_fp16
half __ovld __conv work_group_broadcast(half a, size_t local_id);
half __ovld __conv work_group_broadcast(half a, size_t x, size_t y);
half __ovld __conv work_group_broadcast(half a, size_t x, size_t y, size_t z);
#endif
int __ovld __conv work_group_broadcast(int a, size_t local_id);
int __ovld __conv work_group_broadcast(int a, size_t x, size_t y);
int __ovld __conv work_group_broadcast(int a, size_t x, size_t y, size_t z);
uint __ovld __conv work_group_broadcast(uint a, size_t local_id);
uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y);
uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y, size_t z);
long __ovld __conv work_group_broadcast(long a, size_t local_id);
long __ovld __conv work_group_broadcast(long a, size_t x, size_t y);
long __ovld __conv work_group_broadcast(long a, size_t x, size_t y, size_t z);
ulong __ovld __conv work_group_broadcast(ulong a, size_t local_id);
ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y);
ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
float __ovld __conv work_group_broadcast(float a, size_t local_id);
float __ovld __conv work_group_broadcast(float a, size_t x, size_t y);
float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z);
#ifdef cl_khr_fp64
double __ovld __conv work_group_broadcast(double a, size_t local_id);
double __ovld __conv work_group_broadcast(double a, size_t x, size_t y);
double __ovld __conv work_group_broadcast(double a, size_t x, size_t y, size_t z);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
half __ovld __conv work_group_reduce_add(half x);
half __ovld __conv work_group_reduce_min(half x);
half __ovld __conv work_group_reduce_max(half x);
half __ovld __conv work_group_scan_exclusive_add(half x);
half __ovld __conv work_group_scan_exclusive_min(half x);
half __ovld __conv work_group_scan_exclusive_max(half x);
half __ovld __conv work_group_scan_inclusive_add(half x);
half __ovld __conv work_group_scan_inclusive_min(half x);
half __ovld __conv work_group_scan_inclusive_max(half x);
#endif
int __ovld __conv work_group_reduce_add(int x);
int __ovld __conv work_group_reduce_min(int x);
int __ovld __conv work_group_reduce_max(int x);
int __ovld __conv work_group_scan_exclusive_add(int x);
int __ovld __conv work_group_scan_exclusive_min(int x);
int __ovld __conv work_group_scan_exclusive_max(int x);
int __ovld __conv work_group_scan_inclusive_add(int x);
int __ovld __conv work_group_scan_inclusive_min(int x);
int __ovld __conv work_group_scan_inclusive_max(int x);
uint __ovld __conv work_group_reduce_add(uint x);
uint __ovld __conv work_group_reduce_min(uint x);
uint __ovld __conv work_group_reduce_max(uint x);
uint __ovld __conv work_group_scan_exclusive_add(uint x);
uint __ovld __conv work_group_scan_exclusive_min(uint x);
uint __ovld __conv work_group_scan_exclusive_max(uint x);
uint __ovld __conv work_group_scan_inclusive_add(uint x);
uint __ovld __conv work_group_scan_inclusive_min(uint x);
uint __ovld __conv work_group_scan_inclusive_max(uint x);
long __ovld __conv work_group_reduce_add(long x);
long __ovld __conv work_group_reduce_min(long x);
long __ovld __conv work_group_reduce_max(long x);
long __ovld __conv work_group_scan_exclusive_add(long x);
long __ovld __conv work_group_scan_exclusive_min(long x);
long __ovld __conv work_group_scan_exclusive_max(long x);
long __ovld __conv work_group_scan_inclusive_add(long x);
long __ovld __conv work_group_scan_inclusive_min(long x);
long __ovld __conv work_group_scan_inclusive_max(long x);
ulong __ovld __conv work_group_reduce_add(ulong x);
ulong __ovld __conv work_group_reduce_min(ulong x);
ulong __ovld __conv work_group_reduce_max(ulong x);
ulong __ovld __conv work_group_scan_exclusive_add(ulong x);
ulong __ovld __conv work_group_scan_exclusive_min(ulong x);
ulong __ovld __conv work_group_scan_exclusive_max(ulong x);
ulong __ovld __conv work_group_scan_inclusive_add(ulong x);
ulong __ovld __conv work_group_scan_inclusive_min(ulong x);
ulong __ovld __conv work_group_scan_inclusive_max(ulong x);
float __ovld __conv work_group_reduce_add(float x);
float __ovld __conv work_group_reduce_min(float x);
float __ovld __conv work_group_reduce_max(float x);
float __ovld __conv work_group_scan_exclusive_add(float x);
float __ovld __conv work_group_scan_exclusive_min(float x);
float __ovld __conv work_group_scan_exclusive_max(float x);
float __ovld __conv work_group_scan_inclusive_add(float x);
float __ovld __conv work_group_scan_inclusive_min(float x);
float __ovld __conv work_group_scan_inclusive_max(float x);
#ifdef cl_khr_fp64
double __ovld __conv work_group_reduce_add(double x);
double __ovld __conv work_group_reduce_min(double x);
double __ovld __conv work_group_reduce_max(double x);
double __ovld __conv work_group_scan_exclusive_add(double x);
double __ovld __conv work_group_scan_exclusive_min(double x);
double __ovld __conv work_group_scan_exclusive_max(double x);
double __ovld __conv work_group_scan_inclusive_add(double x);
double __ovld __conv work_group_scan_inclusive_min(double x);
double __ovld __conv work_group_scan_inclusive_max(double x);
#endif //cl_khr_fp64
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
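// A minimal illustrative sketch of how the work-group reduction and scan
// built-ins above are typically combined (OpenCL C 2.0); kernel and argument
// names are hypothetical and the code is never compiled.
#if 0
__kernel void example_work_group_sum(__global const float *in,
                                     __global float *group_sums,
                                     __global float *prefix) {
  size_t gid = get_global_id(0);
  float v = in[gid];
  // Work-group collectives: every work-item in the group must reach these
  // calls. The reduction yields the same value in every item; the scan yields
  // a per-item exclusive prefix sum.
  float sum = work_group_reduce_add(v);
  prefix[gid] = work_group_scan_exclusive_add(v);
  if (get_local_id(0) == 0)
    group_sums[get_group_id(0)] = sum;
}
#endif // illustrative sketch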
// OpenCL v2.0 s6.13.16 - Pipe Functions
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
bool __ovld is_valid_reserve_id(reserve_id_t reserve_id);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
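// A minimal illustrative sketch of is_valid_reserve_id() together with the
// core OpenCL C 2.0 pipe built-ins (reserve_write_pipe, write_pipe,
// commit_write_pipe), which the compiler provides rather than this header;
// names are hypothetical and the code is never compiled.
#if 0
__kernel void example_pipe_reserve(__write_only pipe int out_pipe,
                                   __global const int *src) {
  // A reservation can fail (e.g. the pipe is full), so check it before use.
  reserve_id_t rid = reserve_write_pipe(out_pipe, 1);
  if (is_valid_reserve_id(rid)) {
    int v = src[get_global_id(0)];
    write_pipe(out_pipe, rid, 0, &v);
    commit_write_pipe(out_pipe, rid);
  }
}
#endif // illustrative sketch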
// OpenCL v2.0 s6.13.17 - Enqueue Kernels
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
ndrange_t __ovld ndrange_1D(size_t);
ndrange_t __ovld ndrange_1D(size_t, size_t);
ndrange_t __ovld ndrange_1D(size_t, size_t, size_t);
ndrange_t __ovld ndrange_2D(const size_t[2]);
ndrange_t __ovld ndrange_2D(const size_t[2], const size_t[2]);
ndrange_t __ovld ndrange_2D(const size_t[2], const size_t[2], const size_t[2]);
ndrange_t __ovld ndrange_3D(const size_t[3]);
ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3]);
ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3], const size_t[3]);
-int __ovld enqueue_marker(queue_t, uint, const __private clk_event_t*, __private clk_event_t*);
+int __ovld enqueue_marker(queue_t, uint, const clk_event_t*, clk_event_t*);
void __ovld retain_event(clk_event_t);
void __ovld release_event(clk_event_t);
clk_event_t __ovld create_user_event(void);
void __ovld set_user_event_status(clk_event_t e, int state);
bool __ovld is_valid_event(clk_event_t event);
void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value);
queue_t __ovld get_default_queue(void);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
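// A minimal illustrative sketch of device-side enqueue using ndrange_1D(),
// get_default_queue() and enqueue_marker(). enqueue_kernel,
// CLK_ENQUEUE_FLAGS_NO_WAIT and CLK_SUCCESS are core OpenCL C 2.0 features
// handled by the compiler, not declared in this header. Kernel and argument
// names are hypothetical; the code is never compiled.
#if 0
__kernel void example_device_enqueue(__global int *data, uint n) {
  queue_t q = get_default_queue();
  ndrange_t nd = ndrange_1D((size_t)n);
  clk_event_t child_done;
  int err = enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT, nd,
                           0, NULL, &child_done,
                           ^{ data[get_global_id(0)] += 1; });
  if (err == CLK_SUCCESS) {
    // A marker event that completes once the child kernel has finished.
    clk_event_t marker;
    enqueue_marker(q, 1, &child_done, &marker);
    release_event(marker);
    release_event(child_done);
  }
}
#endif // illustrative sketch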
// OpenCL Extension v2.0 s9.17 - Sub-groups
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
// Shared Sub Group Functions
uint __ovld get_sub_group_size(void);
uint __ovld get_max_sub_group_size(void);
uint __ovld get_num_sub_groups(void);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld get_enqueued_num_sub_groups(void);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld get_sub_group_id(void);
uint __ovld get_sub_group_local_id(void);
void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __conv sub_group_all(int predicate);
int __ovld __conv sub_group_any(int predicate);
int __ovld __conv sub_group_broadcast(int x, uint sub_group_local_id);
uint __ovld __conv sub_group_broadcast(uint x, uint sub_group_local_id);
long __ovld __conv sub_group_broadcast(long x, uint sub_group_local_id);
ulong __ovld __conv sub_group_broadcast(ulong x, uint sub_group_local_id);
float __ovld __conv sub_group_broadcast(float x, uint sub_group_local_id);
int __ovld __conv sub_group_reduce_add(int x);
uint __ovld __conv sub_group_reduce_add(uint x);
long __ovld __conv sub_group_reduce_add(long x);
ulong __ovld __conv sub_group_reduce_add(ulong x);
float __ovld __conv sub_group_reduce_add(float x);
int __ovld __conv sub_group_reduce_min(int x);
uint __ovld __conv sub_group_reduce_min(uint x);
long __ovld __conv sub_group_reduce_min(long x);
ulong __ovld __conv sub_group_reduce_min(ulong x);
float __ovld __conv sub_group_reduce_min(float x);
int __ovld __conv sub_group_reduce_max(int x);
uint __ovld __conv sub_group_reduce_max(uint x);
long __ovld __conv sub_group_reduce_max(long x);
ulong __ovld __conv sub_group_reduce_max(ulong x);
float __ovld __conv sub_group_reduce_max(float x);
int __ovld __conv sub_group_scan_exclusive_add(int x);
uint __ovld __conv sub_group_scan_exclusive_add(uint x);
long __ovld __conv sub_group_scan_exclusive_add(long x);
ulong __ovld __conv sub_group_scan_exclusive_add(ulong x);
float __ovld __conv sub_group_scan_exclusive_add(float x);
int __ovld __conv sub_group_scan_exclusive_min(int x);
uint __ovld __conv sub_group_scan_exclusive_min(uint x);
long __ovld __conv sub_group_scan_exclusive_min(long x);
ulong __ovld __conv sub_group_scan_exclusive_min(ulong x);
float __ovld __conv sub_group_scan_exclusive_min(float x);
int __ovld __conv sub_group_scan_exclusive_max(int x);
uint __ovld __conv sub_group_scan_exclusive_max(uint x);
long __ovld __conv sub_group_scan_exclusive_max(long x);
ulong __ovld __conv sub_group_scan_exclusive_max(ulong x);
float __ovld __conv sub_group_scan_exclusive_max(float x);
int __ovld __conv sub_group_scan_inclusive_add(int x);
uint __ovld __conv sub_group_scan_inclusive_add(uint x);
long __ovld __conv sub_group_scan_inclusive_add(long x);
ulong __ovld __conv sub_group_scan_inclusive_add(ulong x);
float __ovld __conv sub_group_scan_inclusive_add(float x);
int __ovld __conv sub_group_scan_inclusive_min(int x);
uint __ovld __conv sub_group_scan_inclusive_min(uint x);
long __ovld __conv sub_group_scan_inclusive_min(long x);
ulong __ovld __conv sub_group_scan_inclusive_min(ulong x);
float __ovld __conv sub_group_scan_inclusive_min(float x);
int __ovld __conv sub_group_scan_inclusive_max(int x);
uint __ovld __conv sub_group_scan_inclusive_max(uint x);
long __ovld __conv sub_group_scan_inclusive_max(long x);
ulong __ovld __conv sub_group_scan_inclusive_max(ulong x);
float __ovld __conv sub_group_scan_inclusive_max(float x);
#ifdef cl_khr_fp16
half __ovld __conv sub_group_broadcast(half x, uint sub_group_local_id);
half __ovld __conv sub_group_reduce_add(half x);
half __ovld __conv sub_group_reduce_min(half x);
half __ovld __conv sub_group_reduce_max(half x);
half __ovld __conv sub_group_scan_exclusive_add(half x);
half __ovld __conv sub_group_scan_exclusive_min(half x);
half __ovld __conv sub_group_scan_exclusive_max(half x);
half __ovld __conv sub_group_scan_inclusive_add(half x);
half __ovld __conv sub_group_scan_inclusive_min(half x);
half __ovld __conv sub_group_scan_inclusive_max(half x);
#endif //cl_khr_fp16
#ifdef cl_khr_fp64
double __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id);
double __ovld __conv sub_group_reduce_add(double x);
double __ovld __conv sub_group_reduce_min(double x);
double __ovld __conv sub_group_reduce_max(double x);
double __ovld __conv sub_group_scan_exclusive_add(double x);
double __ovld __conv sub_group_scan_exclusive_min(double x);
double __ovld __conv sub_group_scan_exclusive_max(double x);
double __ovld __conv sub_group_scan_inclusive_add(double x);
double __ovld __conv sub_group_scan_inclusive_min(double x);
double __ovld __conv sub_group_scan_inclusive_max(double x);
#endif //cl_khr_fp64
#endif // defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
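// A minimal illustrative sketch of the shared sub-group built-ins above,
// assuming cl_khr_subgroups (or cl_intel_subgroups) is enabled; names are
// hypothetical and the code is never compiled.
#if 0
__kernel void example_sub_group_ops(__global const int *in,
                                    __global int *out) {
  size_t gid = get_global_id(0);
  int v = in[gid];
  // Every work-item in the sub-group gets the same reduction result.
  int sg_sum = sub_group_reduce_add(v);
  // Value held by lane 0 of this sub-group.
  int lane0 = sub_group_broadcast(v, 0u);
  out[gid] = (get_sub_group_local_id() == 0) ? sg_sum : lane0;
}
#endif // illustrative sketch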
#if defined(cl_intel_subgroups)
// Intel-Specific Sub Group Functions
float __ovld __conv intel_sub_group_shuffle( float x, uint c );
float2 __ovld __conv intel_sub_group_shuffle( float2 x, uint c );
float3 __ovld __conv intel_sub_group_shuffle( float3 x, uint c );
float4 __ovld __conv intel_sub_group_shuffle( float4 x, uint c );
float8 __ovld __conv intel_sub_group_shuffle( float8 x, uint c );
float16 __ovld __conv intel_sub_group_shuffle( float16 x, uint c );
int __ovld __conv intel_sub_group_shuffle( int x, uint c );
int2 __ovld __conv intel_sub_group_shuffle( int2 x, uint c );
int3 __ovld __conv intel_sub_group_shuffle( int3 x, uint c );
int4 __ovld __conv intel_sub_group_shuffle( int4 x, uint c );
int8 __ovld __conv intel_sub_group_shuffle( int8 x, uint c );
int16 __ovld __conv intel_sub_group_shuffle( int16 x, uint c );
uint __ovld __conv intel_sub_group_shuffle( uint x, uint c );
uint2 __ovld __conv intel_sub_group_shuffle( uint2 x, uint c );
uint3 __ovld __conv intel_sub_group_shuffle( uint3 x, uint c );
uint4 __ovld __conv intel_sub_group_shuffle( uint4 x, uint c );
uint8 __ovld __conv intel_sub_group_shuffle( uint8 x, uint c );
uint16 __ovld __conv intel_sub_group_shuffle( uint16 x, uint c );
long __ovld __conv intel_sub_group_shuffle( long x, uint c );
ulong __ovld __conv intel_sub_group_shuffle( ulong x, uint c );
float __ovld __conv intel_sub_group_shuffle_down( float cur, float next, uint c );
float2 __ovld __conv intel_sub_group_shuffle_down( float2 cur, float2 next, uint c );
float3 __ovld __conv intel_sub_group_shuffle_down( float3 cur, float3 next, uint c );
float4 __ovld __conv intel_sub_group_shuffle_down( float4 cur, float4 next, uint c );
float8 __ovld __conv intel_sub_group_shuffle_down( float8 cur, float8 next, uint c );
float16 __ovld __conv intel_sub_group_shuffle_down( float16 cur, float16 next, uint c );
int __ovld __conv intel_sub_group_shuffle_down( int cur, int next, uint c );
int2 __ovld __conv intel_sub_group_shuffle_down( int2 cur, int2 next, uint c );
int3 __ovld __conv intel_sub_group_shuffle_down( int3 cur, int3 next, uint c );
int4 __ovld __conv intel_sub_group_shuffle_down( int4 cur, int4 next, uint c );
int8 __ovld __conv intel_sub_group_shuffle_down( int8 cur, int8 next, uint c );
int16 __ovld __conv intel_sub_group_shuffle_down( int16 cur, int16 next, uint c );
uint __ovld __conv intel_sub_group_shuffle_down( uint cur, uint next, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_down( uint2 cur, uint2 next, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_down( uint3 cur, uint3 next, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_down( uint4 cur, uint4 next, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_down( uint8 cur, uint8 next, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_down( uint16 cur, uint16 next, uint c );
long __ovld __conv intel_sub_group_shuffle_down( long cur, long next, uint c );
ulong __ovld __conv intel_sub_group_shuffle_down( ulong cur, ulong next, uint c );
float __ovld __conv intel_sub_group_shuffle_up( float prev, float cur, uint c );
float2 __ovld __conv intel_sub_group_shuffle_up( float2 prev, float2 cur, uint c );
float3 __ovld __conv intel_sub_group_shuffle_up( float3 prev, float3 cur, uint c );
float4 __ovld __conv intel_sub_group_shuffle_up( float4 prev, float4 cur, uint c );
float8 __ovld __conv intel_sub_group_shuffle_up( float8 prev, float8 cur, uint c );
float16 __ovld __conv intel_sub_group_shuffle_up( float16 prev, float16 cur, uint c );
int __ovld __conv intel_sub_group_shuffle_up( int prev, int cur, uint c );
int2 __ovld __conv intel_sub_group_shuffle_up( int2 prev, int2 cur, uint c );
int3 __ovld __conv intel_sub_group_shuffle_up( int3 prev, int3 cur, uint c );
int4 __ovld __conv intel_sub_group_shuffle_up( int4 prev, int4 cur, uint c );
int8 __ovld __conv intel_sub_group_shuffle_up( int8 prev, int8 cur, uint c );
int16 __ovld __conv intel_sub_group_shuffle_up( int16 prev, int16 cur, uint c );
uint __ovld __conv intel_sub_group_shuffle_up( uint prev, uint cur, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_up( uint2 prev, uint2 cur, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_up( uint3 prev, uint3 cur, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_up( uint4 prev, uint4 cur, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_up( uint8 prev, uint8 cur, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_up( uint16 prev, uint16 cur, uint c );
long __ovld __conv intel_sub_group_shuffle_up( long prev, long cur, uint c );
ulong __ovld __conv intel_sub_group_shuffle_up( ulong prev, ulong cur, uint c );
float __ovld __conv intel_sub_group_shuffle_xor( float x, uint c );
float2 __ovld __conv intel_sub_group_shuffle_xor( float2 x, uint c );
float3 __ovld __conv intel_sub_group_shuffle_xor( float3 x, uint c );
float4 __ovld __conv intel_sub_group_shuffle_xor( float4 x, uint c );
float8 __ovld __conv intel_sub_group_shuffle_xor( float8 x, uint c );
float16 __ovld __conv intel_sub_group_shuffle_xor( float16 x, uint c );
int __ovld __conv intel_sub_group_shuffle_xor( int x, uint c );
int2 __ovld __conv intel_sub_group_shuffle_xor( int2 x, uint c );
int3 __ovld __conv intel_sub_group_shuffle_xor( int3 x, uint c );
int4 __ovld __conv intel_sub_group_shuffle_xor( int4 x, uint c );
int8 __ovld __conv intel_sub_group_shuffle_xor( int8 x, uint c );
int16 __ovld __conv intel_sub_group_shuffle_xor( int16 x, uint c );
uint __ovld __conv intel_sub_group_shuffle_xor( uint x, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_xor( uint2 x, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_xor( uint3 x, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_xor( uint4 x, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_xor( uint8 x, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c );
long __ovld __conv intel_sub_group_shuffle_xor( long x, uint c );
ulong __ovld __conv intel_sub_group_shuffle_xor( ulong x, uint c );
uint __ovld __conv intel_sub_group_block_read( read_only image2d_t image, int2 coord );
uint2 __ovld __conv intel_sub_group_block_read2( read_only image2d_t image, int2 coord );
uint4 __ovld __conv intel_sub_group_block_read4( read_only image2d_t image, int2 coord );
uint8 __ovld __conv intel_sub_group_block_read8( read_only image2d_t image, int2 coord );
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read(read_write image2d_t image, int2 coord);
uint2 __ovld __conv intel_sub_group_block_read2(read_write image2d_t image, int2 coord);
uint4 __ovld __conv intel_sub_group_block_read4(read_write image2d_t image, int2 coord);
uint8 __ovld __conv intel_sub_group_block_read8(read_write image2d_t image, int2 coord);
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read( const __global uint* p );
uint2 __ovld __conv intel_sub_group_block_read2( const __global uint* p );
uint4 __ovld __conv intel_sub_group_block_read4( const __global uint* p );
uint8 __ovld __conv intel_sub_group_block_read8( const __global uint* p );
void __ovld __conv intel_sub_group_block_write(write_only image2d_t image, int2 coord, uint data);
void __ovld __conv intel_sub_group_block_write2(write_only image2d_t image, int2 coord, uint2 data);
void __ovld __conv intel_sub_group_block_write4(write_only image2d_t image, int2 coord, uint4 data);
void __ovld __conv intel_sub_group_block_write8(write_only image2d_t image, int2 coord, uint8 data);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write(read_write image2d_t image, int2 coord, uint data);
void __ovld __conv intel_sub_group_block_write2(read_write image2d_t image, int2 coord, uint2 data);
void __ovld __conv intel_sub_group_block_write4(read_write image2d_t image, int2 coord, uint4 data);
void __ovld __conv intel_sub_group_block_write8(read_write image2d_t image, int2 coord, uint8 data);
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write( __global uint* p, uint data );
void __ovld __conv intel_sub_group_block_write2( __global uint* p, uint2 data );
void __ovld __conv intel_sub_group_block_write4( __global uint* p, uint4 data );
void __ovld __conv intel_sub_group_block_write8( __global uint* p, uint8 data );
#ifdef cl_khr_fp16
half __ovld __conv intel_sub_group_shuffle( half x, uint c );
half __ovld __conv intel_sub_group_shuffle_down( half prev, half cur, uint c );
half __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c );
half __ovld __conv intel_sub_group_shuffle_xor( half x, uint c );
#endif
#if defined(cl_khr_fp64)
double __ovld __conv intel_sub_group_shuffle( double x, uint c );
double __ovld __conv intel_sub_group_shuffle_down( double prev, double cur, uint c );
double __ovld __conv intel_sub_group_shuffle_up( double prev, double cur, uint c );
double __ovld __conv intel_sub_group_shuffle_xor( double x, uint c );
#endif
#endif //cl_intel_subgroups
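// A minimal illustrative sketch of lane exchange with the Intel sub-group
// shuffles declared above; names are hypothetical and the code is never
// compiled.
#if 0
__kernel void example_intel_shuffle(__global const float *in,
                                    __global float *out) {
  size_t gid = get_global_id(0);
  uint lane = get_sub_group_local_id();
  float v = in[gid];
  // Fetch v from the next lane, wrapping at the sub-group size; the second
  // argument of intel_sub_group_shuffle is the source lane index.
  float rotated = intel_sub_group_shuffle(v, (lane + 1u) % get_sub_group_size());
  // Butterfly exchange with the lane whose index differs in bit 0.
  float paired = intel_sub_group_shuffle_xor(v, 1u);
  out[gid] = rotated + paired;
}
#endif // illustrative sketch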
#if defined(cl_intel_subgroups_short)
short __ovld __conv intel_sub_group_broadcast( short x, uint sub_group_local_id );
short2 __ovld __conv intel_sub_group_broadcast( short2 x, uint sub_group_local_id );
short3 __ovld __conv intel_sub_group_broadcast( short3 x, uint sub_group_local_id );
short4 __ovld __conv intel_sub_group_broadcast( short4 x, uint sub_group_local_id );
short8 __ovld __conv intel_sub_group_broadcast( short8 x, uint sub_group_local_id );
ushort __ovld __conv intel_sub_group_broadcast( ushort x, uint sub_group_local_id );
ushort2 __ovld __conv intel_sub_group_broadcast( ushort2 x, uint sub_group_local_id );
ushort3 __ovld __conv intel_sub_group_broadcast( ushort3 x, uint sub_group_local_id );
ushort4 __ovld __conv intel_sub_group_broadcast( ushort4 x, uint sub_group_local_id );
ushort8 __ovld __conv intel_sub_group_broadcast( ushort8 x, uint sub_group_local_id );
short __ovld __conv intel_sub_group_shuffle( short x, uint c );
short2 __ovld __conv intel_sub_group_shuffle( short2 x, uint c );
short3 __ovld __conv intel_sub_group_shuffle( short3 x, uint c );
short4 __ovld __conv intel_sub_group_shuffle( short4 x, uint c );
short8 __ovld __conv intel_sub_group_shuffle( short8 x, uint c );
short16 __ovld __conv intel_sub_group_shuffle( short16 x, uint c);
ushort __ovld __conv intel_sub_group_shuffle( ushort x, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle( ushort2 x, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle( ushort3 x, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle( ushort4 x, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle( ushort8 x, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle( ushort16 x, uint c );
short __ovld __conv intel_sub_group_shuffle_down( short cur, short next, uint c );
short2 __ovld __conv intel_sub_group_shuffle_down( short2 cur, short2 next, uint c );
short3 __ovld __conv intel_sub_group_shuffle_down( short3 cur, short3 next, uint c );
short4 __ovld __conv intel_sub_group_shuffle_down( short4 cur, short4 next, uint c );
short8 __ovld __conv intel_sub_group_shuffle_down( short8 cur, short8 next, uint c );
short16 __ovld __conv intel_sub_group_shuffle_down( short16 cur, short16 next, uint c );
ushort __ovld __conv intel_sub_group_shuffle_down( ushort cur, ushort next, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_down( ushort2 cur, ushort2 next, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_down( ushort3 cur, ushort3 next, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_down( ushort4 cur, ushort4 next, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_down( ushort8 cur, ushort8 next, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_down( ushort16 cur, ushort16 next, uint c );
short __ovld __conv intel_sub_group_shuffle_up( short prev, short cur, uint c );
short2 __ovld __conv intel_sub_group_shuffle_up( short2 prev, short2 cur, uint c );
short3 __ovld __conv intel_sub_group_shuffle_up( short3 prev, short3 cur, uint c );
short4 __ovld __conv intel_sub_group_shuffle_up( short4 prev, short4 cur, uint c );
short8 __ovld __conv intel_sub_group_shuffle_up( short8 prev, short8 cur, uint c );
short16 __ovld __conv intel_sub_group_shuffle_up( short16 prev, short16 cur, uint c );
ushort __ovld __conv intel_sub_group_shuffle_up( ushort prev, ushort cur, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_up( ushort2 prev, ushort2 cur, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_up( ushort3 prev, ushort3 cur, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_up( ushort4 prev, ushort4 cur, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_up( ushort8 prev, ushort8 cur, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_up( ushort16 prev, ushort16 cur, uint c );
short __ovld __conv intel_sub_group_shuffle_xor( short x, uint c );
short2 __ovld __conv intel_sub_group_shuffle_xor( short2 x, uint c );
short3 __ovld __conv intel_sub_group_shuffle_xor( short3 x, uint c );
short4 __ovld __conv intel_sub_group_shuffle_xor( short4 x, uint c );
short8 __ovld __conv intel_sub_group_shuffle_xor( short8 x, uint c );
short16 __ovld __conv intel_sub_group_shuffle_xor( short16 x, uint c );
ushort __ovld __conv intel_sub_group_shuffle_xor( ushort x, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_xor( ushort2 x, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_xor( ushort3 x, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_xor( ushort4 x, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_xor( ushort8 x, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_xor( ushort16 x, uint c );
short __ovld __conv intel_sub_group_reduce_add( short x );
ushort __ovld __conv intel_sub_group_reduce_add( ushort x );
short __ovld __conv intel_sub_group_reduce_min( short x );
ushort __ovld __conv intel_sub_group_reduce_min( ushort x );
short __ovld __conv intel_sub_group_reduce_max( short x );
ushort __ovld __conv intel_sub_group_reduce_max( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_add( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_add( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_min( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_min( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_max( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_max( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_add( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_add( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_min( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_min( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_max( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_max( ushort x );
uint __ovld __conv intel_sub_group_block_read_ui( read_only image2d_t image, int2 byte_coord );
uint2 __ovld __conv intel_sub_group_block_read_ui2( read_only image2d_t image, int2 byte_coord );
uint4 __ovld __conv intel_sub_group_block_read_ui4( read_only image2d_t image, int2 byte_coord );
uint8 __ovld __conv intel_sub_group_block_read_ui8( read_only image2d_t image, int2 byte_coord );
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read_ui( read_write image2d_t image, int2 byte_coord );
uint2 __ovld __conv intel_sub_group_block_read_ui2( read_write image2d_t image, int2 byte_coord );
uint4 __ovld __conv intel_sub_group_block_read_ui4( read_write image2d_t image, int2 byte_coord );
uint8 __ovld __conv intel_sub_group_block_read_ui8( read_write image2d_t image, int2 byte_coord );
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read_ui( const __global uint* p );
uint2 __ovld __conv intel_sub_group_block_read_ui2( const __global uint* p );
uint4 __ovld __conv intel_sub_group_block_read_ui4( const __global uint* p );
uint8 __ovld __conv intel_sub_group_block_read_ui8( const __global uint* p );
void __ovld __conv intel_sub_group_block_write_ui( write_only image2d_t image, int2 byte_coord, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( write_only image2d_t image, int2 byte_coord, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( write_only image2d_t image, int2 byte_coord, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( write_only image2d_t image, int2 byte_coord, uint8 data );
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_ui( read_write image2d_t image, int2 byte_coord, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( read_write image2d_t image, int2 byte_coord, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( read_write image2d_t image, int2 byte_coord, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( read_write image2d_t image, int2 byte_coord, uint8 data );
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_ui( __global uint* p, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( __global uint* p, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( __global uint* p, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint8 data );
ushort __ovld __conv intel_sub_group_block_read_us( read_only image2d_t image, int2 coord );
ushort2 __ovld __conv intel_sub_group_block_read_us2( read_only image2d_t image, int2 coord );
ushort4 __ovld __conv intel_sub_group_block_read_us4( read_only image2d_t image, int2 coord );
ushort8 __ovld __conv intel_sub_group_block_read_us8( read_only image2d_t image, int2 coord );
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
ushort __ovld __conv intel_sub_group_block_read_us(read_write image2d_t image, int2 coord);
ushort2 __ovld __conv intel_sub_group_block_read_us2(read_write image2d_t image, int2 coord);
ushort4 __ovld __conv intel_sub_group_block_read_us4(read_write image2d_t image, int2 coord);
ushort8 __ovld __conv intel_sub_group_block_read_us8(read_write image2d_t image, int2 coord);
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
ushort __ovld __conv intel_sub_group_block_read_us( const __global ushort* p );
ushort2 __ovld __conv intel_sub_group_block_read_us2( const __global ushort* p );
ushort4 __ovld __conv intel_sub_group_block_read_us4( const __global ushort* p );
ushort8 __ovld __conv intel_sub_group_block_read_us8( const __global ushort* p );
void __ovld __conv intel_sub_group_block_write_us(write_only image2d_t image, int2 coord, ushort data);
void __ovld __conv intel_sub_group_block_write_us2(write_only image2d_t image, int2 coord, ushort2 data);
void __ovld __conv intel_sub_group_block_write_us4(write_only image2d_t image, int2 coord, ushort4 data);
void __ovld __conv intel_sub_group_block_write_us8(write_only image2d_t image, int2 coord, ushort8 data);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_us(read_write image2d_t image, int2 coord, ushort data);
void __ovld __conv intel_sub_group_block_write_us2(read_write image2d_t image, int2 coord, ushort2 data);
void __ovld __conv intel_sub_group_block_write_us4(read_write image2d_t image, int2 coord, ushort4 data);
void __ovld __conv intel_sub_group_block_write_us8(read_write image2d_t image, int2 coord, ushort8 data);
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_us( __global ushort* p, ushort data );
void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data );
void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data );
void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data );
#endif // cl_intel_subgroups_short
#ifdef cl_intel_device_side_avc_motion_estimation
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin
// MCE built-in functions
uchar __ovld
intel_sub_group_avc_mce_get_default_inter_base_multi_reference_penalty(
uchar slice_type, uchar qp);
ulong __ovld intel_sub_group_avc_mce_get_default_inter_shape_penalty(
uchar slice_type, uchar qp);
uchar __ovld intel_sub_group_avc_mce_get_default_inter_direction_penalty(
uchar slice_type, uchar qp);
uint __ovld intel_sub_group_avc_mce_get_default_intra_luma_shape_penalty(
uchar slice_type, uchar qp);
uint2 __ovld
intel_sub_group_avc_mce_get_default_inter_motion_vector_cost_table(
uchar slice_type, uchar qp);
uchar __ovld intel_sub_group_avc_mce_get_default_intra_luma_mode_penalty(
uchar slice_type, uchar qp);
uint2 __ovld intel_sub_group_avc_mce_get_default_high_penalty_cost_table(void);
uint2 __ovld intel_sub_group_avc_mce_get_default_medium_penalty_cost_table(void);
uint2 __ovld intel_sub_group_avc_mce_get_default_low_penalty_cost_table(void);
uint __ovld intel_sub_group_avc_mce_get_default_non_dc_luma_intra_penalty(void);
uchar __ovld
intel_sub_group_avc_mce_get_default_intra_chroma_mode_base_penalty(void);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_mce_set_inter_base_multi_reference_penalty(
uchar reference_base_penalty, intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_mce_set_inter_shape_penalty(
ulong packed_shape_penalty, intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_mce_set_inter_direction_penalty(
uchar direction_cost, intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_mce_set_motion_vector_cost_function(
ulong packed_cost_center_delta, uint2 packed_cost_table,
uchar cost_precision, intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_mce_set_ac_only_haar(
intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_mce_set_source_interlaced_field_polarity(
uchar src_field_polarity, intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_mce_set_single_reference_interlaced_field_polarity(
uchar ref_field_polarity, intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_mce_set_dual_reference_interlaced_field_polarities(
uchar fwd_ref_field_polarity, uchar bwd_ref_field_polarity,
intel_sub_group_avc_mce_payload_t payload);
ulong __ovld intel_sub_group_avc_mce_get_motion_vectors(
intel_sub_group_avc_mce_result_t result);
ushort __ovld intel_sub_group_avc_mce_get_inter_distortions(
intel_sub_group_avc_mce_result_t result);
ushort __ovld intel_sub_group_avc_mce_get_best_inter_distortion(
intel_sub_group_avc_mce_result_t result);
uchar __ovld intel_sub_group_avc_mce_get_inter_major_shape(
intel_sub_group_avc_mce_result_t result);
uchar __ovld intel_sub_group_avc_mce_get_inter_minor_shapes(
intel_sub_group_avc_mce_result_t result);
uchar __ovld intel_sub_group_avc_mce_get_inter_directions(
intel_sub_group_avc_mce_result_t result);
uchar __ovld intel_sub_group_avc_mce_get_inter_motion_vector_count(
intel_sub_group_avc_mce_result_t result);
uint __ovld intel_sub_group_avc_mce_get_inter_reference_ids(
intel_sub_group_avc_mce_result_t result);
uchar __ovld
intel_sub_group_avc_mce_get_inter_reference_interlaced_field_polarities(
uint packed_reference_ids, uint packed_reference_parameter_field_polarities,
intel_sub_group_avc_mce_result_t result);
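// A minimal illustrative sketch reading back a few of the default MCE cost
// tables and penalties declared above for a given slice type and QP; names
// are hypothetical and the code is never compiled.
#if 0
__kernel void example_mce_defaults(uchar slice_type, uchar qp,
                                   __global uint2 *cost_table,
                                   __global uchar *penalties) {
  cost_table[0] =
      intel_sub_group_avc_mce_get_default_inter_motion_vector_cost_table(
          slice_type, qp);
  penalties[0] =
      intel_sub_group_avc_mce_get_default_inter_direction_penalty(slice_type, qp);
  penalties[1] =
      intel_sub_group_avc_mce_get_default_intra_luma_mode_penalty(slice_type, qp);
}
#endif // illustrative sketch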
// IME built-in functions
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_initialize(
ushort2 src_coord, uchar partition_mask, uchar sad_adjustment);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_single_reference(
short2 ref_offset, uchar search_window_config,
intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_dual_reference(
short2 fwd_ref_offset, short2 bwd_ref_offset, uchar search_window_config,
intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_max_motion_vector_count(
uchar max_motion_vector_count, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_unidirectional_mix_disable(
intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_early_search_termination_threshold(
uchar threshold, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_weighted_sad(
uint packed_sad_weights, intel_sub_group_avc_ime_payload_t payload);
__attribute__((deprecated("If you use the latest Intel driver, please use "
"intel_sub_group_avc_ime_ref_window_size instead",
"intel_sub_group_avc_ime_ref_window_size")))
ushort2 __ovld
intel_sub_group_ime_ref_window_size(uchar search_window_config, char dual_ref);
ushort2 __ovld intel_sub_group_avc_ime_ref_window_size(
uchar search_window_config, char dual_ref);
short2 __ovld intel_sub_group_avc_ime_adjust_ref_offset(
short2 ref_offset, ushort2 src_coord, ushort2 ref_window_size,
ushort2 image_size);
intel_sub_group_avc_ime_result_t __ovld
intel_sub_group_avc_ime_evaluate_with_single_reference(
read_only image2d_t src_image, read_only image2d_t ref_image,
sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_result_t __ovld
intel_sub_group_avc_ime_evaluate_with_dual_reference(
read_only image2d_t src_image, read_only image2d_t fwd_ref_image,
read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler,
intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_result_single_reference_streamout_t __ovld
intel_sub_group_avc_ime_evaluate_with_single_reference_streamout(
read_only image2d_t src_image, read_only image2d_t ref_image,
sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_result_dual_reference_streamout_t __ovld
intel_sub_group_avc_ime_evaluate_with_dual_reference_streamout(
read_only image2d_t src_image, read_only image2d_t fwd_ref_image,
read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler,
intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_result_t __ovld
intel_sub_group_avc_ime_evaluate_with_single_reference_streamin(
read_only image2d_t src_image, read_only image2d_t ref_image,
sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload,
intel_sub_group_avc_ime_single_reference_streamin_t streamin_components);
intel_sub_group_avc_ime_result_t __ovld
intel_sub_group_avc_ime_evaluate_with_dual_reference_streamin(
read_only image2d_t src_image, read_only image2d_t fwd_ref_image,
read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler,
intel_sub_group_avc_ime_payload_t payload,
intel_sub_group_avc_ime_dual_reference_streamin_t streamin_components);
intel_sub_group_avc_ime_result_single_reference_streamout_t __ovld
intel_sub_group_avc_ime_evaluate_with_single_reference_streaminout(
read_only image2d_t src_image, read_only image2d_t ref_image,
sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload,
intel_sub_group_avc_ime_single_reference_streamin_t streamin_components);
intel_sub_group_avc_ime_result_dual_reference_streamout_t __ovld
intel_sub_group_avc_ime_evaluate_with_dual_reference_streaminout(
read_only image2d_t src_image, read_only image2d_t fwd_ref_image,
read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler,
intel_sub_group_avc_ime_payload_t payload,
intel_sub_group_avc_ime_dual_reference_streamin_t streamin_components);
intel_sub_group_avc_ime_single_reference_streamin_t __ovld
intel_sub_group_avc_ime_get_single_reference_streamin(
intel_sub_group_avc_ime_result_single_reference_streamout_t result);
intel_sub_group_avc_ime_dual_reference_streamin_t __ovld
intel_sub_group_avc_ime_get_dual_reference_streamin(
intel_sub_group_avc_ime_result_dual_reference_streamout_t result);
intel_sub_group_avc_ime_result_t __ovld
intel_sub_group_avc_ime_strip_single_reference_streamout(
intel_sub_group_avc_ime_result_single_reference_streamout_t result);
intel_sub_group_avc_ime_result_t __ovld
intel_sub_group_avc_ime_strip_dual_reference_streamout(
intel_sub_group_avc_ime_result_dual_reference_streamout_t result);
uint __ovld intel_sub_group_avc_ime_get_streamout_major_shape_motion_vectors(
intel_sub_group_avc_ime_result_single_reference_streamout_t result,
uchar major_shape);
ushort __ovld intel_sub_group_avc_ime_get_streamout_major_shape_distortions(
intel_sub_group_avc_ime_result_single_reference_streamout_t result,
uchar major_shape);
uchar __ovld intel_sub_group_avc_ime_get_streamout_major_shape_reference_ids(
intel_sub_group_avc_ime_result_single_reference_streamout_t result,
uchar major_shape);
uint __ovld intel_sub_group_avc_ime_get_streamout_major_shape_motion_vectors(
intel_sub_group_avc_ime_result_dual_reference_streamout_t result,
uchar major_shape, uchar direction);
ushort __ovld intel_sub_group_avc_ime_get_streamout_major_shape_distortions(
intel_sub_group_avc_ime_result_dual_reference_streamout_t result,
uchar major_shape, uchar direction);
uchar __ovld intel_sub_group_avc_ime_get_streamout_major_shape_reference_ids(
intel_sub_group_avc_ime_result_dual_reference_streamout_t result,
uchar major_shape, uchar direction);
uchar __ovld intel_sub_group_avc_ime_get_border_reached(
uchar image_select, intel_sub_group_avc_ime_result_t result);
uchar __ovld intel_sub_group_avc_ime_get_truncated_search_indication(
intel_sub_group_avc_ime_result_t result);
uchar __ovld
intel_sub_group_avc_ime_get_unidirectional_early_search_termination(
intel_sub_group_avc_ime_result_t result);
uint __ovld intel_sub_group_avc_ime_get_weighting_pattern_minimum_motion_vector(
intel_sub_group_avc_ime_result_t result);
ushort __ovld intel_sub_group_avc_ime_get_weighting_pattern_minimum_distortion(
intel_sub_group_avc_ime_result_t result);
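// A minimal illustrative sketch of the usual IME flow: initialize a payload,
// configure one reference, evaluate, then read back motion vectors and
// distortions. The sampler is assumed to be a VME media sampler supplied by
// the host; kernel and argument names are hypothetical and the code is never
// compiled.
#if 0
__kernel void example_ime_single_ref(read_only image2d_t src_image,
                                     read_only image2d_t ref_image,
                                     sampler_t vme_sampler,
                                     ushort2 src_coord,
                                     uchar partition_mask,
                                     uchar sad_adjustment,
                                     short2 ref_offset,
                                     uchar search_window_config,
                                     __global ulong *mvs,
                                     __global ushort *distortions) {
  intel_sub_group_avc_ime_payload_t payload =
      intel_sub_group_avc_ime_initialize(src_coord, partition_mask,
                                         sad_adjustment);
  payload = intel_sub_group_avc_ime_set_single_reference(
      ref_offset, search_window_config, payload);
  intel_sub_group_avc_ime_result_t result =
      intel_sub_group_avc_ime_evaluate_with_single_reference(
          src_image, ref_image, vme_sampler, payload);
  size_t gid = get_global_id(0);
  mvs[gid] = intel_sub_group_avc_ime_get_motion_vectors(result);
  distortions[gid] = intel_sub_group_avc_ime_get_inter_distortions(result);
}
#endif // illustrative sketch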
// REF built-in functions
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_fme_initialize(
ushort2 src_coord, ulong motion_vectors, uchar major_shapes,
uchar minor_shapes, uchar directions, uchar pixel_resolution,
uchar sad_adjustment);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_bme_initialize(
ushort2 src_coord, ulong motion_vectors, uchar major_shapes,
uchar minor_shapes, uchar directions, uchar pixel_resolution,
uchar bidirectional_weight, uchar sad_adjustment);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_bidirectional_mix_disable(
intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_bilinear_filter_enable(
intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_ref_result_t __ovld
intel_sub_group_avc_ref_evaluate_with_single_reference(
read_only image2d_t src_image, read_only image2d_t ref_image,
sampler_t vme_media_sampler, intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_ref_result_t __ovld
intel_sub_group_avc_ref_evaluate_with_dual_reference(
read_only image2d_t src_image, read_only image2d_t fwd_ref_image,
read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler,
intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_ref_result_t __ovld
intel_sub_group_avc_ref_evaluate_with_multi_reference(
read_only image2d_t src_image, uint packed_reference_ids,
sampler_t vme_media_sampler, intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_ref_result_t __ovld
intel_sub_group_avc_ref_evaluate_with_multi_reference(
read_only image2d_t src_image, uint packed_reference_ids,
uchar packed_reference_field_polarities, sampler_t vme_media_sampler,
intel_sub_group_avc_ref_payload_t payload);
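// A minimal illustrative sketch of fractional (FME) refinement with the REF
// built-ins above. The packed shape/vector arguments would normally come from
// an IME result; names are hypothetical and the code is never compiled.
#if 0
__kernel void example_ref_refine(read_only image2d_t src_image,
                                 read_only image2d_t ref_image,
                                 sampler_t vme_sampler,
                                 ushort2 src_coord,
                                 ulong motion_vectors,
                                 uchar major_shapes,
                                 uchar minor_shapes,
                                 uchar directions,
                                 uchar pixel_resolution,
                                 uchar sad_adjustment,
                                 __global ulong *refined_mvs) {
  intel_sub_group_avc_ref_payload_t payload =
      intel_sub_group_avc_fme_initialize(src_coord, motion_vectors,
                                         major_shapes, minor_shapes,
                                         directions, pixel_resolution,
                                         sad_adjustment);
  intel_sub_group_avc_ref_result_t result =
      intel_sub_group_avc_ref_evaluate_with_single_reference(
          src_image, ref_image, vme_sampler, payload);
  refined_mvs[get_global_id(0)] =
      intel_sub_group_avc_ref_get_motion_vectors(result);
}
#endif // illustrative sketch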
// SIC built-in functions
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_initialize(
ushort2 src_coord);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_skc(
uint skip_block_partition_type, uint skip_motion_vector_mask,
ulong motion_vectors, uchar bidirectional_weight, uchar skip_sad_adjustment,
intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_ipe(
uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
uchar intra_sad_adjustment, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_configure_ipe(
uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
ushort left_edge_chroma_pixels, ushort upper_left_corner_chroma_pixel,
ushort upper_edge_chroma_pixels, uchar intra_sad_adjustment,
intel_sub_group_avc_sic_payload_t payload);
uint __ovld
intel_sub_group_avc_sic_get_motion_vector_mask(
uint skip_block_partition_type, uchar direction);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_intra_luma_shape_penalty(
uint packed_shape_cost, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_intra_luma_mode_cost_function(
uchar luma_mode_penalty, uint luma_packed_neighbor_modes,
uint luma_packed_non_dc_penalty, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_intra_chroma_mode_cost_function(
uchar chroma_mode_penalty, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_skc_bilinear_filter_enable(
intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_skc_forward_transform_enable(
ulong packed_sad_coefficients, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_block_based_raw_skip_sad(
uchar block_based_skip_type,
intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_result_t __ovld
intel_sub_group_avc_sic_evaluate_ipe(
read_only image2d_t src_image, sampler_t vme_media_sampler,
intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_result_t __ovld
intel_sub_group_avc_sic_evaluate_with_single_reference(
read_only image2d_t src_image, read_only image2d_t ref_image,
sampler_t vme_media_sampler, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_result_t __ovld
intel_sub_group_avc_sic_evaluate_with_dual_reference(
read_only image2d_t src_image, read_only image2d_t fwd_ref_image,
read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler,
intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_result_t __ovld
intel_sub_group_avc_sic_evaluate_with_multi_reference(
read_only image2d_t src_image, uint packed_reference_ids,
sampler_t vme_media_sampler, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_result_t __ovld
intel_sub_group_avc_sic_evaluate_with_multi_reference(
read_only image2d_t src_image, uint packed_reference_ids,
uchar packed_reference_field_polarities, sampler_t vme_media_sampler,
intel_sub_group_avc_sic_payload_t payload);
uchar __ovld intel_sub_group_avc_sic_get_ipe_luma_shape(
intel_sub_group_avc_sic_result_t result);
ushort __ovld intel_sub_group_avc_sic_get_best_ipe_luma_distortion(
intel_sub_group_avc_sic_result_t result);
ushort __ovld intel_sub_group_avc_sic_get_best_ipe_chroma_distortion(
intel_sub_group_avc_sic_result_t result);
ulong __ovld intel_sub_group_avc_sic_get_packed_ipe_luma_modes(
intel_sub_group_avc_sic_result_t result);
uchar __ovld intel_sub_group_avc_sic_get_ipe_chroma_mode(
intel_sub_group_avc_sic_result_t result);
uint __ovld intel_sub_group_avc_sic_get_packed_skc_luma_count_threshold(
intel_sub_group_avc_sic_result_t result);
ulong __ovld intel_sub_group_avc_sic_get_packed_skc_luma_sum_threshold(
intel_sub_group_avc_sic_result_t result);
ushort __ovld intel_sub_group_avc_sic_get_inter_raw_sads(
intel_sub_group_avc_sic_result_t result);
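// A minimal illustrative sketch of intra prediction estimation with the SIC
// built-ins above: configure the IPE inputs (neighbouring edge pixels),
// evaluate, then read back the packed luma modes and best distortion. Names
// are hypothetical and the code is never compiled.
#if 0
__kernel void example_sic_intra(read_only image2d_t src_image,
                                sampler_t vme_sampler,
                                ushort2 src_coord,
                                uchar luma_partition_mask,
                                uchar neighbour_availability,
                                uchar left_edge_luma_pixels,
                                uchar upper_left_corner_luma_pixel,
                                uchar upper_edge_luma_pixels,
                                uchar upper_right_edge_luma_pixels,
                                uchar intra_sad_adjustment,
                                __global ulong *luma_modes,
                                __global ushort *distortions) {
  intel_sub_group_avc_sic_payload_t payload =
      intel_sub_group_avc_sic_initialize(src_coord);
  payload = intel_sub_group_avc_sic_configure_ipe(
      luma_partition_mask, neighbour_availability, left_edge_luma_pixels,
      upper_left_corner_luma_pixel, upper_edge_luma_pixels,
      upper_right_edge_luma_pixels, intra_sad_adjustment, payload);
  intel_sub_group_avc_sic_result_t result =
      intel_sub_group_avc_sic_evaluate_ipe(src_image, vme_sampler, payload);
  size_t gid = get_global_id(0);
  luma_modes[gid] = intel_sub_group_avc_sic_get_packed_ipe_luma_modes(result);
  distortions[gid] =
      intel_sub_group_avc_sic_get_best_ipe_luma_distortion(result);
}
#endif // illustrative sketch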
// Wrappers
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_inter_base_multi_reference_penalty(
uchar reference_base_penalty, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_inter_base_multi_reference_penalty(
uchar reference_base_penalty, intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_inter_base_multi_reference_penalty(
uchar reference_base_penalty, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_inter_shape_penalty(
ulong packed_shape_cost, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_inter_shape_penalty(
ulong packed_shape_cost, intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_inter_shape_penalty(
ulong packed_shape_cost, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_inter_direction_penalty(
uchar direction_cost, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_inter_direction_penalty(
uchar direction_cost, intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_inter_direction_penalty(
uchar direction_cost, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_motion_vector_cost_function(
ulong packed_cost_center_delta, uint2 packed_cost_table,
uchar cost_precision, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_motion_vector_cost_function(
ulong packed_cost_center_delta, uint2 packed_cost_table,
uchar cost_precision, intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_motion_vector_cost_function(
ulong packed_cost_center_delta, uint2 packed_cost_table,
uchar cost_precision, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_source_interlaced_field_polarity(
uchar src_field_polarity, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_source_interlaced_field_polarity(
uchar src_field_polarity, intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_source_interlaced_field_polarity(
uchar src_field_polarity, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_single_reference_interlaced_field_polarity(
uchar ref_field_polarity, intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_single_reference_interlaced_field_polarity(
uchar ref_field_polarity, intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_single_reference_interlaced_field_polarity(
uchar ref_field_polarity, intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_dual_reference_interlaced_field_polarities(
uchar fwd_ref_field_polarity, uchar bwd_ref_field_polarity,
intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_dual_reference_interlaced_field_polarities(
uchar fwd_ref_field_polarity, uchar bwd_ref_field_polarity,
intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_dual_reference_interlaced_field_polarities(
uchar fwd_ref_field_polarity, uchar bwd_ref_field_polarity,
intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_ime_set_ac_only_haar(
intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_ref_set_ac_only_haar(
intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_sic_set_ac_only_haar(
intel_sub_group_avc_sic_payload_t payload);
ulong __ovld intel_sub_group_avc_ime_get_motion_vectors(
intel_sub_group_avc_ime_result_t result);
ulong __ovld intel_sub_group_avc_ref_get_motion_vectors(
intel_sub_group_avc_ref_result_t result);
ushort __ovld intel_sub_group_avc_ime_get_inter_distortions(
intel_sub_group_avc_ime_result_t result);
ushort __ovld intel_sub_group_avc_ref_get_inter_distortions(
intel_sub_group_avc_ref_result_t result);
ushort __ovld intel_sub_group_avc_sic_get_inter_distortions(
intel_sub_group_avc_sic_result_t result);
ushort __ovld intel_sub_group_avc_ime_get_best_inter_distortion(
intel_sub_group_avc_ime_result_t result);
ushort __ovld intel_sub_group_avc_ref_get_best_inter_distortion(
intel_sub_group_avc_ref_result_t result);
uchar __ovld intel_sub_group_avc_ime_get_inter_major_shape(
intel_sub_group_avc_ime_result_t result);
uchar __ovld intel_sub_group_avc_ref_get_inter_major_shape(
intel_sub_group_avc_ref_result_t result);
uchar __ovld intel_sub_group_avc_ime_get_inter_minor_shapes(
intel_sub_group_avc_ime_result_t result);
uchar __ovld intel_sub_group_avc_ref_get_inter_minor_shapes(
intel_sub_group_avc_ref_result_t result);
uchar __ovld intel_sub_group_avc_ime_get_inter_directions(
intel_sub_group_avc_ime_result_t result);
uchar __ovld intel_sub_group_avc_ref_get_inter_directions(
intel_sub_group_avc_ref_result_t result);
uchar __ovld intel_sub_group_avc_ime_get_inter_motion_vector_count(
intel_sub_group_avc_ime_result_t result);
uchar __ovld intel_sub_group_avc_ref_get_inter_motion_vector_count(
intel_sub_group_avc_ref_result_t result);
uint __ovld intel_sub_group_avc_ime_get_inter_reference_ids(
intel_sub_group_avc_ime_result_t result);
uint __ovld intel_sub_group_avc_ref_get_inter_reference_ids(
intel_sub_group_avc_ref_result_t result);
uchar __ovld
intel_sub_group_avc_ime_get_inter_reference_interlaced_field_polarities(
uint packed_reference_ids, uint packed_reference_parameter_field_polarities,
intel_sub_group_avc_ime_result_t result);
uchar __ovld
intel_sub_group_avc_ref_get_inter_reference_interlaced_field_polarities(
uint packed_reference_ids, uint packed_reference_parameter_field_polarities,
intel_sub_group_avc_ref_result_t result);
// Type conversion functions
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_ime_convert_to_mce_payload(
intel_sub_group_avc_ime_payload_t payload);
intel_sub_group_avc_ime_payload_t __ovld
intel_sub_group_avc_mce_convert_to_ime_payload(
intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_ref_convert_to_mce_payload(
intel_sub_group_avc_ref_payload_t payload);
intel_sub_group_avc_ref_payload_t __ovld
intel_sub_group_avc_mce_convert_to_ref_payload(
intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_payload_t __ovld
intel_sub_group_avc_sic_convert_to_mce_payload(
intel_sub_group_avc_sic_payload_t payload);
intel_sub_group_avc_sic_payload_t __ovld
intel_sub_group_avc_mce_convert_to_sic_payload(
intel_sub_group_avc_mce_payload_t payload);
intel_sub_group_avc_mce_result_t __ovld
intel_sub_group_avc_ime_convert_to_mce_result(
intel_sub_group_avc_ime_result_t result);
intel_sub_group_avc_ime_result_t __ovld
intel_sub_group_avc_mce_convert_to_ime_result(
intel_sub_group_avc_mce_result_t result);
intel_sub_group_avc_mce_result_t __ovld
intel_sub_group_avc_ref_convert_to_mce_result(
intel_sub_group_avc_ref_result_t result);
intel_sub_group_avc_ref_result_t __ovld
intel_sub_group_avc_mce_convert_to_ref_result(
intel_sub_group_avc_mce_result_t result);
intel_sub_group_avc_mce_result_t __ovld
intel_sub_group_avc_sic_convert_to_mce_result(
intel_sub_group_avc_sic_result_t result);
intel_sub_group_avc_sic_result_t __ovld
intel_sub_group_avc_mce_convert_to_sic_result(
intel_sub_group_avc_mce_result_t result);
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
#endif // cl_intel_device_side_avc_motion_estimation
#ifdef cl_amd_media_ops
uint __ovld amd_bitalign(uint a, uint b, uint c);
uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);
uint3 __ovld amd_bitalign(uint3 a, uint3 b, uint3 c);
uint4 __ovld amd_bitalign(uint4 a, uint4 b, uint4 c);
uint8 __ovld amd_bitalign(uint8 a, uint8 b, uint8 c);
uint16 __ovld amd_bitalign(uint16 a, uint16 b, uint16 c);
uint __ovld amd_bytealign(uint a, uint b, uint c);
uint2 __ovld amd_bytealign(uint2 a, uint2 b, uint2 c);
uint3 __ovld amd_bytealign(uint3 a, uint3 b, uint3 c);
uint4 __ovld amd_bytealign(uint4 a, uint4 b, uint4 c);
uint8 __ovld amd_bytealign(uint8 a, uint8 b, uint8 c);
uint16 __ovld amd_bytealign(uint16 a, uint16 b, uint16 c);
uint __ovld amd_lerp(uint a, uint b, uint c);
uint2 __ovld amd_lerp(uint2 a, uint2 b, uint2 c);
uint3 __ovld amd_lerp(uint3 a, uint3 b, uint3 c);
uint4 __ovld amd_lerp(uint4 a, uint4 b, uint4 c);
uint8 __ovld amd_lerp(uint8 a, uint8 b, uint8 c);
uint16 __ovld amd_lerp(uint16 a, uint16 b, uint16 c);
uint __ovld amd_pack(float4 v);
uint __ovld amd_sad4(uint4 x, uint4 y, uint z);
uint __ovld amd_sadhi(uint a, uint b, uint c);
uint2 __ovld amd_sadhi(uint2 a, uint2 b, uint2 c);
uint3 __ovld amd_sadhi(uint3 a, uint3 b, uint3 c);
uint4 __ovld amd_sadhi(uint4 a, uint4 b, uint4 c);
uint8 __ovld amd_sadhi(uint8 a, uint8 b, uint8 c);
uint16 __ovld amd_sadhi(uint16 a, uint16 b, uint16 c);
uint __ovld amd_sad(uint a, uint b, uint c);
uint2 __ovld amd_sad(uint2 a, uint2 b, uint2 c);
uint3 __ovld amd_sad(uint3 a, uint3 b, uint3 c);
uint4 __ovld amd_sad(uint4 a, uint4 b, uint4 c);
uint8 __ovld amd_sad(uint8 a, uint8 b, uint8 c);
uint16 __ovld amd_sad(uint16 a, uint16 b, uint16 c);
float __ovld amd_unpack0(uint a);
float2 __ovld amd_unpack0(uint2 a);
float3 __ovld amd_unpack0(uint3 a);
float4 __ovld amd_unpack0(uint4 a);
float8 __ovld amd_unpack0(uint8 a);
float16 __ovld amd_unpack0(uint16 a);
float __ovld amd_unpack1(uint a);
float2 __ovld amd_unpack1(uint2 a);
float3 __ovld amd_unpack1(uint3 a);
float4 __ovld amd_unpack1(uint4 a);
float8 __ovld amd_unpack1(uint8 a);
float16 __ovld amd_unpack1(uint16 a);
float __ovld amd_unpack2(uint a);
float2 __ovld amd_unpack2(uint2 a);
float3 __ovld amd_unpack2(uint3 a);
float4 __ovld amd_unpack2(uint4 a);
float8 __ovld amd_unpack2(uint8 a);
float16 __ovld amd_unpack2(uint16 a);
float __ovld amd_unpack3(uint a);
float2 __ovld amd_unpack3(uint2 a);
float3 __ovld amd_unpack3(uint3 a);
float4 __ovld amd_unpack3(uint4 a);
float8 __ovld amd_unpack3(uint8 a);
float16 __ovld amd_unpack3(uint16 a);
#endif // cl_amd_media_ops
#ifdef cl_amd_media_ops2
int __ovld amd_bfe(int src0, uint src1, uint src2);
int2 __ovld amd_bfe(int2 src0, uint2 src1, uint2 src2);
int3 __ovld amd_bfe(int3 src0, uint3 src1, uint3 src2);
int4 __ovld amd_bfe(int4 src0, uint4 src1, uint4 src2);
int8 __ovld amd_bfe(int8 src0, uint8 src1, uint8 src2);
int16 __ovld amd_bfe(int16 src0, uint16 src1, uint16 src2);
uint __ovld amd_bfe(uint src0, uint src1, uint src2);
uint2 __ovld amd_bfe(uint2 src0, uint2 src1, uint2 src2);
uint3 __ovld amd_bfe(uint3 src0, uint3 src1, uint3 src2);
uint4 __ovld amd_bfe(uint4 src0, uint4 src1, uint4 src2);
uint8 __ovld amd_bfe(uint8 src0, uint8 src1, uint8 src2);
uint16 __ovld amd_bfe(uint16 src0, uint16 src1, uint16 src2);
uint __ovld amd_bfm(uint src0, uint src1);
uint2 __ovld amd_bfm(uint2 src0, uint2 src1);
uint3 __ovld amd_bfm(uint3 src0, uint3 src1);
uint4 __ovld amd_bfm(uint4 src0, uint4 src1);
uint8 __ovld amd_bfm(uint8 src0, uint8 src1);
uint16 __ovld amd_bfm(uint16 src0, uint16 src1);
float __ovld amd_max3(float src0, float src1, float src2);
float2 __ovld amd_max3(float2 src0, float2 src1, float2 src2);
float3 __ovld amd_max3(float3 src0, float3 src1, float3 src2);
float4 __ovld amd_max3(float4 src0, float4 src1, float4 src2);
float8 __ovld amd_max3(float8 src0, float8 src1, float8 src2);
float16 __ovld amd_max3(float16 src0, float16 src1, float16 src2);
int __ovld amd_max3(int src0, int src1, int src2);
int2 __ovld amd_max3(int2 src0, int2 src1, int2 src2);
int3 __ovld amd_max3(int3 src0, int3 src1, int3 src2);
int4 __ovld amd_max3(int4 src0, int4 src1, int4 src2);
int8 __ovld amd_max3(int8 src0, int8 src1, int8 src2);
int16 __ovld amd_max3(int16 src0, int16 src1, int16 src2);
uint __ovld amd_max3(uint src0, uint src1, uint src2);
uint2 __ovld amd_max3(uint2 src0, uint2 src1, uint2 src2);
uint3 __ovld amd_max3(uint3 src0, uint3 src1, uint3 src2);
uint4 __ovld amd_max3(uint4 src0, uint4 src1, uint4 src2);
uint8 __ovld amd_max3(uint8 src0, uint8 src1, uint8 src2);
uint16 __ovld amd_max3(uint16 src0, uint16 src1, uint16 src2);
float __ovld amd_median3(float src0, float src1, float src2);
float2 __ovld amd_median3(float2 src0, float2 src1, float2 src2);
float3 __ovld amd_median3(float3 src0, float3 src1, float3 src2);
float4 __ovld amd_median3(float4 src0, float4 src1, float4 src2);
float8 __ovld amd_median3(float8 src0, float8 src1, float8 src2);
float16 __ovld amd_median3(float16 src0, float16 src1, float16 src2);
int __ovld amd_median3(int src0, int src1, int src2);
int2 __ovld amd_median3(int2 src0, int2 src1, int2 src2);
int3 __ovld amd_median3(int3 src0, int3 src1, int3 src2);
int4 __ovld amd_median3(int4 src0, int4 src1, int4 src2);
int8 __ovld amd_median3(int8 src0, int8 src1, int8 src2);
int16 __ovld amd_median3(int16 src0, int16 src1, int16 src2);
uint __ovld amd_median3(uint src0, uint src1, uint src2);
uint2 __ovld amd_median3(uint2 src0, uint2 src1, uint2 src2);
uint3 __ovld amd_median3(uint3 src0, uint3 src1, uint3 src2);
uint4 __ovld amd_median3(uint4 src0, uint4 src1, uint4 src2);
uint8 __ovld amd_median3(uint8 src0, uint8 src1, uint8 src2);
uint16 __ovld amd_median3(uint16 src0, uint16 src1, uint16 src2);
float __ovld amd_min3(float src0, float src1, float src2);
float2 __ovld amd_min3(float2 src0, float2 src1, float2 src2);
float3 __ovld amd_min3(float3 src0, float3 src1, float3 src2);
float4 __ovld amd_min3(float4 src0, float4 src1, float4 src2);
float8 __ovld amd_min3(float8 src0, float8 src1, float8 src2);
float16 __ovld amd_min3(float16 src0, float16 src1, float16 src2);
int __ovld amd_min3(int src0, int src1, int src2);
int2 __ovld amd_min3(int2 src0, int2 src1, int2 src2);
int3 __ovld amd_min3(int3 src0, int3 src1, int3 src2);
int4 __ovld amd_min3(int4 src0, int4 src1, int4 src2);
int8 __ovld amd_min3(int8 src0, int8 src1, int8 src2);
int16 __ovld amd_min3(int16 src0, int16 src1, int16 src2);
uint __ovld amd_min3(uint src0, uint src1, uint src2);
uint2 __ovld amd_min3(uint2 src0, uint2 src1, uint2 src2);
uint3 __ovld amd_min3(uint3 src0, uint3 src1, uint3 src2);
uint4 __ovld amd_min3(uint4 src0, uint4 src1, uint4 src2);
uint8 __ovld amd_min3(uint8 src0, uint8 src1, uint8 src2);
uint16 __ovld amd_min3(uint16 src0, uint16 src1, uint16 src2);
ulong __ovld amd_mqsad(ulong src0, uint src1, ulong src2);
ulong2 __ovld amd_mqsad(ulong2 src0, uint2 src1, ulong2 src2);
ulong3 __ovld amd_mqsad(ulong3 src0, uint3 src1, ulong3 src2);
ulong4 __ovld amd_mqsad(ulong4 src0, uint4 src1, ulong4 src2);
ulong8 __ovld amd_mqsad(ulong8 src0, uint8 src1, ulong8 src2);
ulong16 __ovld amd_mqsad(ulong16 src0, uint16 src1, ulong16 src2);
ulong __ovld amd_qsad(ulong src0, uint src1, ulong src2);
ulong2 __ovld amd_qsad(ulong2 src0, uint2 src1, ulong2 src2);
ulong3 __ovld amd_qsad(ulong3 src0, uint3 src1, ulong3 src2);
ulong4 __ovld amd_qsad(ulong4 src0, uint4 src1, ulong4 src2);
ulong8 __ovld amd_qsad(ulong8 src0, uint8 src1, ulong8 src2);
ulong16 __ovld amd_qsad(ulong16 src0, uint16 src1, ulong16 src2);
uint __ovld amd_msad(uint src0, uint src1, uint src2);
uint2 __ovld amd_msad(uint2 src0, uint2 src1, uint2 src2);
uint3 __ovld amd_msad(uint3 src0, uint3 src1, uint3 src2);
uint4 __ovld amd_msad(uint4 src0, uint4 src1, uint4 src2);
uint8 __ovld amd_msad(uint8 src0, uint8 src1, uint8 src2);
uint16 __ovld amd_msad(uint16 src0, uint16 src1, uint16 src2);
uint __ovld amd_sadd(uint src0, uint src1, uint src2);
uint2 __ovld amd_sadd(uint2 src0, uint2 src1, uint2 src2);
uint3 __ovld amd_sadd(uint3 src0, uint3 src1, uint3 src2);
uint4 __ovld amd_sadd(uint4 src0, uint4 src1, uint4 src2);
uint8 __ovld amd_sadd(uint8 src0, uint8 src1, uint8 src2);
uint16 __ovld amd_sadd(uint16 src0, uint16 src1, uint16 src2);
uint __ovld amd_sadw(uint src0, uint src1, uint src2);
uint2 __ovld amd_sadw(uint2 src0, uint2 src1, uint2 src2);
uint3 __ovld amd_sadw(uint3 src0, uint3 src1, uint3 src2);
uint4 __ovld amd_sadw(uint4 src0, uint4 src1, uint4 src2);
uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2);
uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2);
#endif // cl_amd_media_ops2
#if defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : begin
uint __ovld arm_dot(uchar4 a, uchar4 b);
int __ovld arm_dot(char4 a, char4 b);
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : end
#endif // defined(cl_arm_integer_dot_product_int8)
#if defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : begin
uint __ovld arm_dot_acc(uchar4 a, uchar4 b, uint c);
int __ovld arm_dot_acc(char4 a, char4 b, int c);
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : end
#endif // defined(cl_arm_integer_dot_product_accumulate_int8)
#if defined(cl_arm_integer_dot_product_accumulate_int16)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int16 : begin
uint __ovld arm_dot_acc(ushort2 a, ushort2 b, uint c);
int __ovld arm_dot_acc(short2 a, short2 b, int c);
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int16 : end
#endif // defined(cl_arm_integer_dot_product_accumulate_int16)
#if defined(cl_arm_integer_dot_product_accumulate_saturate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_saturate_int8 : begin
uint __ovld arm_dot_acc_sat(uchar4 a, uchar4 b, uint c);
int __ovld arm_dot_acc_sat(char4 a, char4 b, int c);
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_saturate_int8 : end
#endif // defined(cl_arm_integer_dot_product_accumulate_saturate_int8)
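// Illustrative usage (not part of this header), assuming a device that
// advertises cl_arm_integer_dot_product_accumulate_int8:
//   uchar4 a = (uchar4)(1, 2, 3, 4);
//   uchar4 b = (uchar4)(5, 6, 7, 8);
//   uint acc = arm_dot_acc(a, b, 0u);   // 1*5 + 2*6 + 3*7 + 4*8 = 70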
// Disable any extensions we may have enabled previously.
#pragma OPENCL EXTENSION all : disable
#undef __cnfn
#undef __ovld
#endif //_OPENCL_H_
Index: projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaDeclCXX.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaDeclCXX.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaDeclCXX.cpp (revision 351722)
@@ -1,15753 +1,15754 @@
//===------ SemaDeclCXX.cpp - Semantic Analysis for C++ Declarations ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements semantic analysis for C++ declarations.
//
//===----------------------------------------------------------------------===//
#include "clang/AST/ASTConsumer.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTLambda.h"
#include "clang/AST/ASTMutationListener.h"
#include "clang/AST/CXXInheritance.h"
#include "clang/AST/CharUnits.h"
#include "clang/AST/ComparisonCategories.h"
#include "clang/AST/EvaluatedExprVisitor.h"
#include "clang/AST/ExprCXX.h"
#include "clang/AST/RecordLayout.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/AST/TypeLoc.h"
#include "clang/AST/TypeOrdering.h"
#include "clang/Basic/PartialDiagnostic.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Sema/CXXFieldCollector.h"
#include "clang/Sema/DeclSpec.h"
#include "clang/Sema/Initialization.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/ParsedTemplate.h"
#include "clang/Sema/Scope.h"
#include "clang/Sema/ScopeInfo.h"
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/Template.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include <map>
#include <set>
using namespace clang;
//===----------------------------------------------------------------------===//
// CheckDefaultArgumentVisitor
//===----------------------------------------------------------------------===//
namespace {
/// CheckDefaultArgumentVisitor - C++ [dcl.fct.default] Traverses
/// the default argument of a parameter to determine whether it
/// contains any ill-formed subexpressions. For example, this will
/// diagnose the use of local variables or parameters within the
/// default argument expression.
class CheckDefaultArgumentVisitor
: public StmtVisitor<CheckDefaultArgumentVisitor, bool> {
Expr *DefaultArg;
Sema *S;
public:
CheckDefaultArgumentVisitor(Expr *defarg, Sema *s)
: DefaultArg(defarg), S(s) {}
bool VisitExpr(Expr *Node);
bool VisitDeclRefExpr(DeclRefExpr *DRE);
bool VisitCXXThisExpr(CXXThisExpr *ThisE);
bool VisitLambdaExpr(LambdaExpr *Lambda);
bool VisitPseudoObjectExpr(PseudoObjectExpr *POE);
};
/// VisitExpr - Visit all of the children of this expression.
bool CheckDefaultArgumentVisitor::VisitExpr(Expr *Node) {
bool IsInvalid = false;
for (Stmt *SubStmt : Node->children())
IsInvalid |= Visit(SubStmt);
return IsInvalid;
}
/// VisitDeclRefExpr - Visit a reference to a declaration, to
/// determine whether this declaration can be used in the default
/// argument expression.
bool CheckDefaultArgumentVisitor::VisitDeclRefExpr(DeclRefExpr *DRE) {
NamedDecl *Decl = DRE->getDecl();
if (ParmVarDecl *Param = dyn_cast<ParmVarDecl>(Decl)) {
// C++ [dcl.fct.default]p9
// Default arguments are evaluated each time the function is
// called. The order of evaluation of function arguments is
// unspecified. Consequently, parameters of a function shall not
// be used in default argument expressions, even if they are not
// evaluated. Parameters of a function declared before a default
// argument expression are in scope and can hide namespace and
// class member names.
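// Illustration (not in the original source); this is the rule that rejects:
//   int f(int a, int b = a);   // error: parameter 'a' used in b's default argument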
return S->Diag(DRE->getBeginLoc(),
diag::err_param_default_argument_references_param)
<< Param->getDeclName() << DefaultArg->getSourceRange();
} else if (VarDecl *VDecl = dyn_cast<VarDecl>(Decl)) {
// C++ [dcl.fct.default]p7
// Local variables shall not be used in default argument
// expressions.
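// Illustration (not in the original source):
//   void g() { int local = 0; void h(int x = local); }   // error: local variable in default argument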
if (VDecl->isLocalVarDecl())
return S->Diag(DRE->getBeginLoc(),
diag::err_param_default_argument_references_local)
<< VDecl->getDeclName() << DefaultArg->getSourceRange();
}
return false;
}
/// VisitCXXThisExpr - Visit a C++ "this" expression.
bool CheckDefaultArgumentVisitor::VisitCXXThisExpr(CXXThisExpr *ThisE) {
// C++ [dcl.fct.default]p8:
// The keyword this shall not be used in a default argument of a
// member function.
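// Illustration (not in the original source):
//   struct S { int m; void f(int x = this->m); };   // error: 'this' in a default argument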
return S->Diag(ThisE->getBeginLoc(),
diag::err_param_default_argument_references_this)
<< ThisE->getSourceRange();
}
bool CheckDefaultArgumentVisitor::VisitPseudoObjectExpr(PseudoObjectExpr *POE) {
bool Invalid = false;
for (PseudoObjectExpr::semantics_iterator
i = POE->semantics_begin(), e = POE->semantics_end(); i != e; ++i) {
Expr *E = *i;
// Look through bindings.
if (OpaqueValueExpr *OVE = dyn_cast<OpaqueValueExpr>(E)) {
E = OVE->getSourceExpr();
assert(E && "pseudo-object binding without source expression?");
}
Invalid |= Visit(E);
}
return Invalid;
}
bool CheckDefaultArgumentVisitor::VisitLambdaExpr(LambdaExpr *Lambda) {
// C++11 [expr.lambda.prim]p13:
// A lambda-expression appearing in a default argument shall not
// implicitly or explicitly capture any entity.
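// Illustration (not in the original source):
//   void f(int n, int x = [n] { return n; }());   // error: lambda captures 'n'
//   void g(int x = [] { return 1; }());           // OK: captureless lambda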
if (Lambda->capture_begin() == Lambda->capture_end())
return false;
return S->Diag(Lambda->getBeginLoc(), diag::err_lambda_capture_default_arg);
}
}
void
Sema::ImplicitExceptionSpecification::CalledDecl(SourceLocation CallLoc,
const CXXMethodDecl *Method) {
// If we have an MSAny spec already, don't bother.
if (!Method || ComputedEST == EST_MSAny)
return;
const FunctionProtoType *Proto
= Method->getType()->getAs<FunctionProtoType>();
Proto = Self->ResolveExceptionSpec(CallLoc, Proto);
if (!Proto)
return;
ExceptionSpecificationType EST = Proto->getExceptionSpecType();
// If we have a throw-all spec at this point, ignore the function.
if (ComputedEST == EST_None)
return;
if (EST == EST_None && Method->hasAttr<NoThrowAttr>())
EST = EST_BasicNoexcept;
switch (EST) {
case EST_Unparsed:
case EST_Uninstantiated:
case EST_Unevaluated:
llvm_unreachable("should not see unresolved exception specs here");
// If this function can throw any exceptions, make a note of that.
case EST_MSAny:
case EST_None:
// FIXME: Whichever we see last of MSAny and None determines our result.
// We should make a consistent, order-independent choice here.
ClearExceptions();
ComputedEST = EST;
return;
case EST_NoexceptFalse:
ClearExceptions();
ComputedEST = EST_None;
return;
// FIXME: If the call to this decl is using any of its default arguments, we
// need to search them for potentially-throwing calls.
// If this function has a basic noexcept, it doesn't affect the outcome.
case EST_BasicNoexcept:
case EST_NoexceptTrue:
case EST_NoThrow:
return;
// If we're still at noexcept(true) and there's a throw() callee,
// change to that specification.
case EST_DynamicNone:
if (ComputedEST == EST_BasicNoexcept)
ComputedEST = EST_DynamicNone;
return;
case EST_DependentNoexcept:
llvm_unreachable(
"should not generate implicit declarations for dependent cases");
case EST_Dynamic:
break;
}
assert(EST == EST_Dynamic && "EST case not considered earlier.");
assert(ComputedEST != EST_None &&
"Shouldn't collect exceptions when throw-all is guaranteed.");
ComputedEST = EST_Dynamic;
// Record the exceptions in this function's exception specification.
for (const auto &E : Proto->exceptions())
if (ExceptionsSeen.insert(Self->Context.getCanonicalType(E)).second)
Exceptions.push_back(E);
}
void Sema::ImplicitExceptionSpecification::CalledExpr(Expr *E) {
if (!E || ComputedEST == EST_MSAny)
return;
// FIXME:
//
// C++0x [except.spec]p14:
// [An] implicit exception-specification specifies the type-id T if and
// only if T is allowed by the exception-specification of a function directly
// invoked by f's implicit definition; f shall allow all exceptions if any
// function it directly invokes allows all exceptions, and f shall allow no
// exceptions if every function it directly invokes allows no exceptions.
//
// Note in particular that if an implicit exception-specification is generated
// for a function containing a throw-expression, that specification can still
// be noexcept(true).
//
// Note also that 'directly invoked' is not defined in the standard, and there
// is no indication that we should only consider potentially-evaluated calls.
//
// Ultimately we should implement the intent of the standard: the exception
// specification should be the set of exceptions which can be thrown by the
// implicit definition. For now, we assume that any non-nothrow expression can
// throw any exception.
if (Self->canThrow(E))
ComputedEST = EST_None;
}
bool
Sema::SetParamDefaultArgument(ParmVarDecl *Param, Expr *Arg,
SourceLocation EqualLoc) {
if (RequireCompleteType(Param->getLocation(), Param->getType(),
diag::err_typecheck_decl_incomplete_type)) {
Param->setInvalidDecl();
return true;
}
// C++ [dcl.fct.default]p5
// A default argument expression is implicitly converted (clause
// 4) to the parameter type. The default argument expression has
// the same semantic constraints as the initializer expression in
// a declaration of a variable of the parameter type, using the
// copy-initialization semantics (8.5).
InitializedEntity Entity = InitializedEntity::InitializeParameter(Context,
Param);
InitializationKind Kind = InitializationKind::CreateCopy(Param->getLocation(),
EqualLoc);
InitializationSequence InitSeq(*this, Entity, Kind, Arg);
ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Arg);
if (Result.isInvalid())
return true;
Arg = Result.getAs<Expr>();
CheckCompletedExpr(Arg, EqualLoc);
Arg = MaybeCreateExprWithCleanups(Arg);
// Okay: add the default argument to the parameter
Param->setDefaultArg(Arg);
// We have already instantiated this parameter; provide each of the
// instantiations with the uninstantiated default argument.
UnparsedDefaultArgInstantiationsMap::iterator InstPos
= UnparsedDefaultArgInstantiations.find(Param);
if (InstPos != UnparsedDefaultArgInstantiations.end()) {
for (unsigned I = 0, N = InstPos->second.size(); I != N; ++I)
InstPos->second[I]->setUninstantiatedDefaultArg(Arg);
// We're done tracking this parameter's instantiations.
UnparsedDefaultArgInstantiations.erase(InstPos);
}
return false;
}
/// ActOnParamDefaultArgument - Check whether the default argument
/// provided for a function parameter is well-formed. If so, attach it
/// to the parameter declaration.
void
Sema::ActOnParamDefaultArgument(Decl *param, SourceLocation EqualLoc,
Expr *DefaultArg) {
if (!param || !DefaultArg)
return;
ParmVarDecl *Param = cast<ParmVarDecl>(param);
UnparsedDefaultArgLocs.erase(Param);
// Default arguments are only permitted in C++
if (!getLangOpts().CPlusPlus) {
Diag(EqualLoc, diag::err_param_default_argument)
<< DefaultArg->getSourceRange();
Param->setInvalidDecl();
return;
}
// Check for unexpanded parameter packs.
if (DiagnoseUnexpandedParameterPack(DefaultArg, UPPC_DefaultArgument)) {
Param->setInvalidDecl();
return;
}
// C++11 [dcl.fct.default]p3
// A default argument expression [...] shall not be specified for a
// parameter pack.
if (Param->isParameterPack()) {
Diag(EqualLoc, diag::err_param_default_argument_on_parameter_pack)
<< DefaultArg->getSourceRange();
return;
}
// Check that the default argument is well-formed
CheckDefaultArgumentVisitor DefaultArgChecker(DefaultArg, this);
if (DefaultArgChecker.Visit(DefaultArg)) {
Param->setInvalidDecl();
return;
}
SetParamDefaultArgument(Param, DefaultArg, EqualLoc);
}
/// ActOnParamUnparsedDefaultArgument - We've seen a default
/// argument for a function parameter, but we can't parse it yet
/// because we're inside a class definition. Note that this default
/// argument will be parsed later.
void Sema::ActOnParamUnparsedDefaultArgument(Decl *param,
SourceLocation EqualLoc,
SourceLocation ArgLoc) {
if (!param)
return;
ParmVarDecl *Param = cast<ParmVarDecl>(param);
Param->setUnparsedDefaultArg();
UnparsedDefaultArgLocs[Param] = ArgLoc;
}
/// ActOnParamDefaultArgumentError - Parsing or semantic analysis of
/// the default argument for the parameter param failed.
void Sema::ActOnParamDefaultArgumentError(Decl *param,
SourceLocation EqualLoc) {
if (!param)
return;
ParmVarDecl *Param = cast<ParmVarDecl>(param);
Param->setInvalidDecl();
UnparsedDefaultArgLocs.erase(Param);
Param->setDefaultArg(new(Context)
OpaqueValueExpr(EqualLoc,
Param->getType().getNonReferenceType(),
VK_RValue));
}
/// CheckExtraCXXDefaultArguments - Check for any extra default
/// arguments in the declarator, which is not a function declaration
/// or definition and therefore is not permitted to have default
/// arguments. This routine should be invoked for every declarator
/// that is not a function declaration or definition.
void Sema::CheckExtraCXXDefaultArguments(Declarator &D) {
// C++ [dcl.fct.default]p3
// A default argument expression shall be specified only in the
// parameter-declaration-clause of a function declaration or in a
// template-parameter (14.1). It shall not be specified for a
// parameter pack. If it is specified in a
// parameter-declaration-clause, it shall not occur within a
// declarator or abstract-declarator of a parameter-declaration.
bool MightBeFunction = D.isFunctionDeclarationContext();
for (unsigned i = 0, e = D.getNumTypeObjects(); i != e; ++i) {
DeclaratorChunk &chunk = D.getTypeObject(i);
if (chunk.Kind == DeclaratorChunk::Function) {
if (MightBeFunction) {
// This is a function declaration. It can have default arguments, but
// keep looking in case its return type is a function type with default
// arguments.
MightBeFunction = false;
continue;
}
for (unsigned argIdx = 0, e = chunk.Fun.NumParams; argIdx != e;
++argIdx) {
ParmVarDecl *Param = cast<ParmVarDecl>(chunk.Fun.Params[argIdx].Param);
if (Param->hasUnparsedDefaultArg()) {
std::unique_ptr<CachedTokens> Toks =
std::move(chunk.Fun.Params[argIdx].DefaultArgTokens);
SourceRange SR;
if (Toks->size() > 1)
SR = SourceRange((*Toks)[1].getLocation(),
Toks->back().getLocation());
else
SR = UnparsedDefaultArgLocs[Param];
Diag(Param->getLocation(), diag::err_param_default_argument_nonfunc)
<< SR;
} else if (Param->getDefaultArg()) {
Diag(Param->getLocation(), diag::err_param_default_argument_nonfunc)
<< Param->getDefaultArg()->getSourceRange();
Param->setDefaultArg(nullptr);
}
}
} else if (chunk.Kind != DeclaratorChunk::Paren) {
MightBeFunction = false;
}
}
}
static bool functionDeclHasDefaultArgument(const FunctionDecl *FD) {
for (unsigned NumParams = FD->getNumParams(); NumParams > 0; --NumParams) {
const ParmVarDecl *PVD = FD->getParamDecl(NumParams-1);
if (!PVD->hasDefaultArg())
return false;
if (!PVD->hasInheritedDefaultArg())
return true;
}
return false;
}
/// MergeCXXFunctionDecl - Merge two declarations of the same C++
/// function, once we already know that they have the same
/// type. Subroutine of MergeFunctionDecl. Returns true if there was an
/// error, false otherwise.
bool Sema::MergeCXXFunctionDecl(FunctionDecl *New, FunctionDecl *Old,
Scope *S) {
bool Invalid = false;
// The declaration context corresponding to the scope is the semantic
// parent, unless this is a local function declaration, in which case
// it is that surrounding function.
DeclContext *ScopeDC = New->isLocalExternDecl()
? New->getLexicalDeclContext()
: New->getDeclContext();
// Find the previous declaration for the purpose of default arguments.
FunctionDecl *PrevForDefaultArgs = Old;
for (/**/; PrevForDefaultArgs;
// Don't bother looking back past the latest decl if this is a local
// extern declaration; nothing else could work.
PrevForDefaultArgs = New->isLocalExternDecl()
? nullptr
: PrevForDefaultArgs->getPreviousDecl()) {
// Ignore hidden declarations.
if (!LookupResult::isVisible(*this, PrevForDefaultArgs))
continue;
if (S && !isDeclInScope(PrevForDefaultArgs, ScopeDC, S) &&
!New->isCXXClassMember()) {
// Ignore default arguments of old decl if they are not in
// the same scope and this is not an out-of-line definition of
// a member function.
continue;
}
if (PrevForDefaultArgs->isLocalExternDecl() != New->isLocalExternDecl()) {
// If only one of these is a local function declaration, then they are
// declared in different scopes, even though isDeclInScope may think
// they're in the same scope. (If both are local, the scope check is
// sufficient, and if neither is local, then they are in the same scope.)
continue;
}
// We found the right previous declaration.
break;
}
// C++ [dcl.fct.default]p4:
// For non-template functions, default arguments can be added in
// later declarations of a function in the same
// scope. Declarations in different scopes have completely
// distinct sets of default arguments. That is, declarations in
// inner scopes do not acquire default arguments from
// declarations in outer scopes, and vice versa. In a given
// function declaration, all parameters subsequent to a
// parameter with a default argument shall have default
// arguments supplied in this or previous declarations. A
// default argument shall not be redefined by a later
// declaration (not even to the same value).
//
// C++ [dcl.fct.default]p6:
// Except for member functions of class templates, the default arguments
// in a member function definition that appears outside of the class
// definition are added to the set of default arguments provided by the
// member function declaration in the class definition.
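// A hypothetical sketch (not in the original source) of what p4 allows and forbids:
//   void f(int a, int b, int c = 3);   // #1
//   void f(int a, int b = 2, int c);   // OK: later declaration adds a default for 'b'
//   void f(int a, int b = 2, int c);   // error: redefines b's default, even to the same value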
for (unsigned p = 0, NumParams = PrevForDefaultArgs
? PrevForDefaultArgs->getNumParams()
: 0;
p < NumParams; ++p) {
ParmVarDecl *OldParam = PrevForDefaultArgs->getParamDecl(p);
ParmVarDecl *NewParam = New->getParamDecl(p);
bool OldParamHasDfl = OldParam ? OldParam->hasDefaultArg() : false;
bool NewParamHasDfl = NewParam->hasDefaultArg();
if (OldParamHasDfl && NewParamHasDfl) {
unsigned DiagDefaultParamID =
diag::err_param_default_argument_redefinition;
// MSVC accepts redefining default arguments for member functions of class
// templates. The new default argument's value is ignored.
Invalid = true;
if (getLangOpts().MicrosoftExt) {
CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(New);
if (MD && MD->getParent()->getDescribedClassTemplate()) {
// Merge the old default argument into the new parameter.
NewParam->setHasInheritedDefaultArg();
if (OldParam->hasUninstantiatedDefaultArg())
NewParam->setUninstantiatedDefaultArg(
OldParam->getUninstantiatedDefaultArg());
else
NewParam->setDefaultArg(OldParam->getInit());
DiagDefaultParamID = diag::ext_param_default_argument_redefinition;
Invalid = false;
}
}
// FIXME: If we knew where the '=' was, we could easily provide a fix-it
// hint here. Alternatively, we could walk the type-source information
// for NewParam to find the last source location in the type... but it
// isn't worth the effort right now. This is the kind of test case that
// is hard to get right:
// int f(int);
// void g(int (*fp)(int) = f);
// void g(int (*fp)(int) = &f);
Diag(NewParam->getLocation(), DiagDefaultParamID)
<< NewParam->getDefaultArgRange();
// Look for the function declaration where the default argument was
// actually written, which may be a declaration prior to Old.
for (auto Older = PrevForDefaultArgs;
OldParam->hasInheritedDefaultArg(); /**/) {
Older = Older->getPreviousDecl();
OldParam = Older->getParamDecl(p);
}
Diag(OldParam->getLocation(), diag::note_previous_definition)
<< OldParam->getDefaultArgRange();
} else if (OldParamHasDfl) {
// Merge the old default argument into the new parameter unless the new
// function is a friend declaration in a template class. In the latter
// case the default arguments will be inherited when the friend
// declaration is instantiated.
if (New->getFriendObjectKind() == Decl::FOK_None ||
!New->getLexicalDeclContext()->isDependentContext()) {
// It's important to use getInit() here; getDefaultArg()
// strips off any top-level ExprWithCleanups.
NewParam->setHasInheritedDefaultArg();
if (OldParam->hasUnparsedDefaultArg())
NewParam->setUnparsedDefaultArg();
else if (OldParam->hasUninstantiatedDefaultArg())
NewParam->setUninstantiatedDefaultArg(
OldParam->getUninstantiatedDefaultArg());
else
NewParam->setDefaultArg(OldParam->getInit());
}
} else if (NewParamHasDfl) {
if (New->getDescribedFunctionTemplate()) {
// Paragraph 4, quoted above, only applies to non-template functions.
Diag(NewParam->getLocation(),
diag::err_param_default_argument_template_redecl)
<< NewParam->getDefaultArgRange();
Diag(PrevForDefaultArgs->getLocation(),
diag::note_template_prev_declaration)
<< false;
} else if (New->getTemplateSpecializationKind()
!= TSK_ImplicitInstantiation &&
New->getTemplateSpecializationKind() != TSK_Undeclared) {
// C++ [temp.expr.spec]p21:
// Default function arguments shall not be specified in a declaration
// or a definition for one of the following explicit specializations:
// - the explicit specialization of a function template;
// - the explicit specialization of a member function template;
// - the explicit specialization of a member function of a class
// template where the class template specialization to which the
// member function specialization belongs is implicitly
// instantiated.
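// Illustration (not in the original source):
//   template <typename T> void f(T);
//   template <> void f<int>(int x = 0);   // error: default argument on an explicit specialization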
Diag(NewParam->getLocation(), diag::err_template_spec_default_arg)
<< (New->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
<< New->getDeclName()
<< NewParam->getDefaultArgRange();
} else if (New->getDeclContext()->isDependentContext()) {
// C++ [dcl.fct.default]p6 (DR217):
// Default arguments for a member function of a class template shall
// be specified on the initial declaration of the member function
// within the class template.
//
// Reading the tea leaves a bit in DR217 and its reference to DR205
// leads me to the conclusion that one cannot add default function
// arguments for an out-of-line definition of a member function of a
// dependent type.
int WhichKind = 2;
if (CXXRecordDecl *Record
= dyn_cast<CXXRecordDecl>(New->getDeclContext())) {
if (Record->getDescribedClassTemplate())
WhichKind = 0;
else if (isa<ClassTemplatePartialSpecializationDecl>(Record))
WhichKind = 1;
else
WhichKind = 2;
}
Diag(NewParam->getLocation(),
diag::err_param_default_argument_member_template_redecl)
<< WhichKind
<< NewParam->getDefaultArgRange();
}
}
}
// DR1344: If a default argument is added outside a class definition and that
// default argument makes the function a special member function, the program
// is ill-formed. This can only happen for constructors.
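// Illustration (not in the original source) of the DR1344 case:
//   struct X { X(int); };
//   X::X(int i = 0) {}   // error: the added default turns X(int) into a default constructor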
if (isa<CXXConstructorDecl>(New) &&
New->getMinRequiredArguments() < Old->getMinRequiredArguments()) {
CXXSpecialMember NewSM = getSpecialMember(cast<CXXMethodDecl>(New)),
OldSM = getSpecialMember(cast<CXXMethodDecl>(Old));
if (NewSM != OldSM) {
ParmVarDecl *NewParam = New->getParamDecl(New->getMinRequiredArguments());
assert(NewParam->hasDefaultArg());
Diag(NewParam->getLocation(), diag::err_default_arg_makes_ctor_special)
<< NewParam->getDefaultArgRange() << NewSM;
Diag(Old->getLocation(), diag::note_previous_declaration);
}
}
const FunctionDecl *Def;
// C++11 [dcl.constexpr]p1: If any declaration of a function or function
// template has a constexpr specifier then all its declarations shall
// contain the constexpr specifier.
if (New->getConstexprKind() != Old->getConstexprKind()) {
Diag(New->getLocation(), diag::err_constexpr_redecl_mismatch)
<< New << New->getConstexprKind() << Old->getConstexprKind();
Diag(Old->getLocation(), diag::note_previous_declaration);
Invalid = true;
} else if (!Old->getMostRecentDecl()->isInlined() && New->isInlined() &&
Old->isDefined(Def) &&
// If a friend function is inlined but does not have 'inline'
// specifier, it is a definition. Do not report attribute conflict
// in this case, redefinition will be diagnosed later.
(New->isInlineSpecified() ||
New->getFriendObjectKind() == Decl::FOK_None)) {
// C++11 [dcl.fcn.spec]p4:
// If the definition of a function appears in a translation unit before its
// first declaration as inline, the program is ill-formed.
Diag(New->getLocation(), diag::err_inline_decl_follows_def) << New;
Diag(Def->getLocation(), diag::note_previous_definition);
Invalid = true;
}
// C++17 [temp.deduct.guide]p3:
// Two deduction guide declarations in the same translation unit
// for the same class template shall not have equivalent
// parameter-declaration-clauses.
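// Illustration (not in the original source):
//   template <typename T> struct Y { };
//   Y(int) -> Y<int>;
//   Y(int) -> Y<int>;   // error: equivalent deduction guide redeclared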
if (isa<CXXDeductionGuideDecl>(New) &&
!New->isFunctionTemplateSpecialization()) {
Diag(New->getLocation(), diag::err_deduction_guide_redeclared);
Diag(Old->getLocation(), diag::note_previous_declaration);
}
// C++11 [dcl.fct.default]p4: If a friend declaration specifies a default
// argument expression, that declaration shall be a definition and shall be
// the only declaration of the function or function template in the
// translation unit.
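// Illustration (not in the original source):
//   struct Z { friend void h(int = 0) {} };
//   void h(int);   // error: 'h' was declared as a friend with a default argument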
if (Old->getFriendObjectKind() == Decl::FOK_Undeclared &&
functionDeclHasDefaultArgument(Old)) {
Diag(New->getLocation(), diag::err_friend_decl_with_def_arg_redeclared);
Diag(Old->getLocation(), diag::note_previous_declaration);
Invalid = true;
}
return Invalid;
}
NamedDecl *
Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D,
MultiTemplateParamsArg TemplateParamLists) {
assert(D.isDecompositionDeclarator());
const DecompositionDeclarator &Decomp = D.getDecompositionDeclarator();
// The syntax only allows a decomposition declarator as a simple-declaration,
// a for-range-declaration, or a condition in Clang, but we parse it in more
// cases than that.
if (!D.mayHaveDecompositionDeclarator()) {
Diag(Decomp.getLSquareLoc(), diag::err_decomp_decl_context)
<< Decomp.getSourceRange();
return nullptr;
}
if (!TemplateParamLists.empty()) {
// FIXME: There's no rule against this, but there are also no rules that
// would actually make it usable, so we reject it for now.
Diag(TemplateParamLists.front()->getTemplateLoc(),
diag::err_decomp_decl_template);
return nullptr;
}
Diag(Decomp.getLSquareLoc(),
!getLangOpts().CPlusPlus17
? diag::ext_decomp_decl
: D.getContext() == DeclaratorContext::ConditionContext
? diag::ext_decomp_decl_cond
: diag::warn_cxx14_compat_decomp_decl)
<< Decomp.getSourceRange();
// The semantic context is always just the current context.
DeclContext *const DC = CurContext;
// C++17 [dcl.dcl]/8:
// The decl-specifier-seq shall contain only the type-specifier auto
// and cv-qualifiers.
// C++2a [dcl.dcl]/8:
// If decl-specifier-seq contains any decl-specifier other than static,
// thread_local, auto, or cv-qualifiers, the program is ill-formed.
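// Illustration (not in the original source), assuming <utility> for std::pair:
//   auto [a, b] = std::pair<int, int>(1, 2);             // OK
//   static auto [c, d] = std::pair<int, int>(3, 4);      // extension before C++2a
//   constexpr auto [e, f] = std::pair<int, int>(5, 6);   // error: bad decl-specifier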
auto &DS = D.getDeclSpec();
{
SmallVector<StringRef, 8> BadSpecifiers;
SmallVector<SourceLocation, 8> BadSpecifierLocs;
SmallVector<StringRef, 8> CPlusPlus20Specifiers;
SmallVector<SourceLocation, 8> CPlusPlus20SpecifierLocs;
if (auto SCS = DS.getStorageClassSpec()) {
if (SCS == DeclSpec::SCS_static) {
CPlusPlus20Specifiers.push_back(DeclSpec::getSpecifierName(SCS));
CPlusPlus20SpecifierLocs.push_back(DS.getStorageClassSpecLoc());
} else {
BadSpecifiers.push_back(DeclSpec::getSpecifierName(SCS));
BadSpecifierLocs.push_back(DS.getStorageClassSpecLoc());
}
}
if (auto TSCS = DS.getThreadStorageClassSpec()) {
CPlusPlus20Specifiers.push_back(DeclSpec::getSpecifierName(TSCS));
CPlusPlus20SpecifierLocs.push_back(DS.getThreadStorageClassSpecLoc());
}
if (DS.hasConstexprSpecifier()) {
BadSpecifiers.push_back(
DeclSpec::getSpecifierName(DS.getConstexprSpecifier()));
BadSpecifierLocs.push_back(DS.getConstexprSpecLoc());
}
if (DS.isInlineSpecified()) {
BadSpecifiers.push_back("inline");
BadSpecifierLocs.push_back(DS.getInlineSpecLoc());
}
if (!BadSpecifiers.empty()) {
auto &&Err = Diag(BadSpecifierLocs.front(), diag::err_decomp_decl_spec);
Err << (int)BadSpecifiers.size()
<< llvm::join(BadSpecifiers.begin(), BadSpecifiers.end(), " ");
// Don't add FixItHints to remove the specifiers; we do still respect
// them when building the underlying variable.
for (auto Loc : BadSpecifierLocs)
Err << SourceRange(Loc, Loc);
} else if (!CPlusPlus20Specifiers.empty()) {
auto &&Warn = Diag(CPlusPlus20SpecifierLocs.front(),
getLangOpts().CPlusPlus2a
? diag::warn_cxx17_compat_decomp_decl_spec
: diag::ext_decomp_decl_spec);
Warn << (int)CPlusPlus20Specifiers.size()
<< llvm::join(CPlusPlus20Specifiers.begin(),
CPlusPlus20Specifiers.end(), " ");
for (auto Loc : CPlusPlus20SpecifierLocs)
Warn << SourceRange(Loc, Loc);
}
// We can't recover from it being declared as a typedef.
if (DS.getStorageClassSpec() == DeclSpec::SCS_typedef)
return nullptr;
}
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
QualType R = TInfo->getType();
if (DiagnoseUnexpandedParameterPack(D.getIdentifierLoc(), TInfo,
UPPC_DeclarationType))
D.setInvalidType();
// The syntax only allows a single ref-qualifier prior to the decomposition
// declarator. No other declarator chunks are permitted. Also check the type
// specifier here.
if (DS.getTypeSpecType() != DeclSpec::TST_auto ||
D.hasGroupingParens() || D.getNumTypeObjects() > 1 ||
(D.getNumTypeObjects() == 1 &&
D.getTypeObject(0).Kind != DeclaratorChunk::Reference)) {
Diag(Decomp.getLSquareLoc(),
(D.hasGroupingParens() ||
(D.getNumTypeObjects() &&
D.getTypeObject(0).Kind == DeclaratorChunk::Paren))
? diag::err_decomp_decl_parens
: diag::err_decomp_decl_type)
<< R;
// In most cases, there's no actual problem with an explicitly-specified
// type, but a function type won't work here, and ActOnVariableDeclarator
// shouldn't be called for such a type.
if (R->isFunctionType())
D.setInvalidType();
}
// Build the BindingDecls.
SmallVector<BindingDecl*, 8> Bindings;
// Build a BindingDecl for each binding name, checking for name conflicts as we go.
for (auto &B : D.getDecompositionDeclarator().bindings()) {
// Check for name conflicts.
DeclarationNameInfo NameInfo(B.Name, B.NameLoc);
LookupResult Previous(*this, NameInfo, LookupOrdinaryName,
ForVisibleRedeclaration);
LookupName(Previous, S,
/*CreateBuiltins*/DC->getRedeclContext()->isTranslationUnit());
// It's not permitted to shadow a template parameter name.
if (Previous.isSingleResult() &&
Previous.getFoundDecl()->isTemplateParameter()) {
DiagnoseTemplateParameterShadow(D.getIdentifierLoc(),
Previous.getFoundDecl());
Previous.clear();
}
bool ConsiderLinkage = DC->isFunctionOrMethod() &&
DS.getStorageClassSpec() == DeclSpec::SCS_extern;
FilterLookupForScope(Previous, DC, S, ConsiderLinkage,
/*AllowInlineNamespace*/false);
if (!Previous.empty()) {
auto *Old = Previous.getRepresentativeDecl();
Diag(B.NameLoc, diag::err_redefinition) << B.Name;
Diag(Old->getLocation(), diag::note_previous_definition);
}
auto *BD = BindingDecl::Create(Context, DC, B.NameLoc, B.Name);
PushOnScopeChains(BD, S, true);
Bindings.push_back(BD);
ParsingInitForAutoVars.insert(BD);
}
// There are no prior lookup results for the variable itself, because it
// is unnamed.
DeclarationNameInfo NameInfo((IdentifierInfo *)nullptr,
Decomp.getLSquareLoc());
LookupResult Previous(*this, NameInfo, LookupOrdinaryName,
ForVisibleRedeclaration);
// Build the variable that holds the non-decomposed object.
bool AddToScope = true;
NamedDecl *New =
ActOnVariableDeclarator(S, D, DC, TInfo, Previous,
MultiTemplateParamsArg(), AddToScope, Bindings);
if (AddToScope) {
S->AddDecl(New);
CurContext->addHiddenDecl(New);
}
if (isInOpenMPDeclareTargetContext())
checkDeclIsAllowedInOpenMPTarget(nullptr, New);
return New;
}
static bool checkSimpleDecomposition(
Sema &S, ArrayRef<BindingDecl *> Bindings, ValueDecl *Src,
QualType DecompType, const llvm::APSInt &NumElems, QualType ElemType,
llvm::function_ref<ExprResult(SourceLocation, Expr *, unsigned)> GetInit) {
if ((int64_t)Bindings.size() != NumElems) {
S.Diag(Src->getLocation(), diag::err_decomp_decl_wrong_number_bindings)
<< DecompType << (unsigned)Bindings.size() << NumElems.toString(10)
<< (NumElems < Bindings.size());
return true;
}
unsigned I = 0;
for (auto *B : Bindings) {
SourceLocation Loc = B->getLocation();
ExprResult E = S.BuildDeclRefExpr(Src, DecompType, VK_LValue, Loc);
if (E.isInvalid())
return true;
E = GetInit(Loc, E.get(), I++);
if (E.isInvalid())
return true;
B->setBinding(ElemType, E.get());
}
return false;
}
static bool checkArrayLikeDecomposition(Sema &S,
ArrayRef<BindingDecl *> Bindings,
ValueDecl *Src, QualType DecompType,
const llvm::APSInt &NumElems,
QualType ElemType) {
return checkSimpleDecomposition(
S, Bindings, Src, DecompType, NumElems, ElemType,
[&](SourceLocation Loc, Expr *Base, unsigned I) -> ExprResult {
ExprResult E = S.ActOnIntegerConstant(Loc, I);
if (E.isInvalid())
return ExprError();
return S.CreateBuiltinArraySubscriptExpr(Base, Loc, E.get(), Loc);
});
}
static bool checkArrayDecomposition(Sema &S, ArrayRef<BindingDecl*> Bindings,
ValueDecl *Src, QualType DecompType,
const ConstantArrayType *CAT) {
return checkArrayLikeDecomposition(S, Bindings, Src, DecompType,
llvm::APSInt(CAT->getSize()),
CAT->getElementType());
}
static bool checkVectorDecomposition(Sema &S, ArrayRef<BindingDecl*> Bindings,
ValueDecl *Src, QualType DecompType,
const VectorType *VT) {
return checkArrayLikeDecomposition(
S, Bindings, Src, DecompType, llvm::APSInt::get(VT->getNumElements()),
S.Context.getQualifiedType(VT->getElementType(),
DecompType.getQualifiers()));
}
static bool checkComplexDecomposition(Sema &S,
ArrayRef<BindingDecl *> Bindings,
ValueDecl *Src, QualType DecompType,
const ComplexType *CT) {
return checkSimpleDecomposition(
S, Bindings, Src, DecompType, llvm::APSInt::get(2),
S.Context.getQualifiedType(CT->getElementType(),
DecompType.getQualifiers()),
[&](SourceLocation Loc, Expr *Base, unsigned I) -> ExprResult {
return S.CreateBuiltinUnaryOp(Loc, I ? UO_Imag : UO_Real, Base);
});
}
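// Illustration (not in the original source) of the decompositions handled above:
//   int a[2] = {1, 2};
//   auto [x, y] = a;      // array case: x and y name the two elements
//   _Complex double z = 1.0;
//   auto [re, im] = z;    // complex case: __real z and __imag z (Clang extension)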
static std::string printTemplateArgs(const PrintingPolicy &PrintingPolicy,
TemplateArgumentListInfo &Args) {
SmallString<128> SS;
llvm::raw_svector_ostream OS(SS);
bool First = true;
for (auto &Arg : Args.arguments()) {
if (!First)
OS << ", ";
Arg.getArgument().print(PrintingPolicy, OS);
First = false;
}
return OS.str();
}
static bool lookupStdTypeTraitMember(Sema &S, LookupResult &TraitMemberLookup,
SourceLocation Loc, StringRef Trait,
TemplateArgumentListInfo &Args,
unsigned DiagID) {
auto DiagnoseMissing = [&] {
if (DiagID)
S.Diag(Loc, DiagID) << printTemplateArgs(S.Context.getPrintingPolicy(),
Args);
return true;
};
// FIXME: Factor out duplication with lookupPromiseType in SemaCoroutine.
NamespaceDecl *Std = S.getStdNamespace();
if (!Std)
return DiagnoseMissing();
// Look up the trait itself, within namespace std. We can diagnose various
// problems with this lookup even if we've been asked to not diagnose a
// missing specialization, because this can only fail if the user has been
// declaring their own names in namespace std or we don't support the
// standard library implementation in use.
LookupResult Result(S, &S.PP.getIdentifierTable().get(Trait),
Loc, Sema::LookupOrdinaryName);
if (!S.LookupQualifiedName(Result, Std))
return DiagnoseMissing();
if (Result.isAmbiguous())
return true;
ClassTemplateDecl *TraitTD = Result.getAsSingle<ClassTemplateDecl>();
if (!TraitTD) {
Result.suppressDiagnostics();
NamedDecl *Found = *Result.begin();
S.Diag(Loc, diag::err_std_type_trait_not_class_template) << Trait;
S.Diag(Found->getLocation(), diag::note_declared_at);
return true;
}
// Build the template-id.
QualType TraitTy = S.CheckTemplateIdType(TemplateName(TraitTD), Loc, Args);
if (TraitTy.isNull())
return true;
if (!S.isCompleteType(Loc, TraitTy)) {
if (DiagID)
S.RequireCompleteType(
Loc, TraitTy, DiagID,
printTemplateArgs(S.Context.getPrintingPolicy(), Args));
return true;
}
CXXRecordDecl *RD = TraitTy->getAsCXXRecordDecl();
assert(RD && "specialization of class template is not a class?");
// Look up the member of the trait type.
S.LookupQualifiedName(TraitMemberLookup, RD);
return TraitMemberLookup.isAmbiguous();
}
static TemplateArgumentLoc
getTrivialIntegralTemplateArgument(Sema &S, SourceLocation Loc, QualType T,
uint64_t I) {
TemplateArgument Arg(S.Context, S.Context.MakeIntValue(I, T), T);
return S.getTrivialTemplateArgumentLoc(Arg, T, Loc);
}
static TemplateArgumentLoc
getTrivialTypeTemplateArgument(Sema &S, SourceLocation Loc, QualType T) {
return S.getTrivialTemplateArgumentLoc(TemplateArgument(T), QualType(), Loc);
}
namespace { enum class IsTupleLike { TupleLike, NotTupleLike, Error }; }
static IsTupleLike isTupleLike(Sema &S, SourceLocation Loc, QualType T,
llvm::APSInt &Size) {
EnterExpressionEvaluationContext ContextRAII(
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
DeclarationName Value = S.PP.getIdentifierInfo("value");
LookupResult R(S, Value, Loc, Sema::LookupOrdinaryName);
// Form template argument list for tuple_size<T>.
TemplateArgumentListInfo Args(Loc, Loc);
Args.addArgument(getTrivialTypeTemplateArgument(S, Loc, T));
// If there's no tuple_size specialization or the lookup of 'value' is empty,
// it's not tuple-like.
if (lookupStdTypeTraitMember(S, R, Loc, "tuple_size", Args, /*DiagID*/ 0) ||
R.empty())
return IsTupleLike::NotTupleLike;
// If we get this far, we've committed to the tuple interpretation, but
// we can still fail if there actually isn't a usable ::value.
struct ICEDiagnoser : Sema::VerifyICEDiagnoser {
LookupResult &R;
TemplateArgumentListInfo &Args;
ICEDiagnoser(LookupResult &R, TemplateArgumentListInfo &Args)
: R(R), Args(Args) {}
void diagnoseNotICE(Sema &S, SourceLocation Loc, SourceRange SR) {
S.Diag(Loc, diag::err_decomp_decl_std_tuple_size_not_constant)
<< printTemplateArgs(S.Context.getPrintingPolicy(), Args);
}
} Diagnoser(R, Args);
ExprResult E =
S.BuildDeclarationNameExpr(CXXScopeSpec(), R, /*NeedsADL*/false);
if (E.isInvalid())
return IsTupleLike::Error;
E = S.VerifyIntegerConstantExpression(E.get(), &Size, Diagnoser, false);
if (E.isInvalid())
return IsTupleLike::Error;
return IsTupleLike::TupleLike;
}
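// Illustration (not in the original source), assuming <tuple>: a type is treated as
// tuple-like once std::tuple_size<T>::value is a usable constant, e.g.:
//   std::tuple<int, float> t{1, 2.0f};
//   auto [i, f] = t;   // uses std::tuple_size, std::tuple_element and get<I>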
/// \return std::tuple_element<I, T>::type.
static QualType getTupleLikeElementType(Sema &S, SourceLocation Loc,
unsigned I, QualType T) {
// Form template argument list for tuple_element<I, T>.
TemplateArgumentListInfo Args(Loc, Loc);
Args.addArgument(
getTrivialIntegralTemplateArgument(S, Loc, S.Context.getSizeType(), I));
Args.addArgument(getTrivialTypeTemplateArgument(S, Loc, T));
DeclarationName TypeDN = S.PP.getIdentifierInfo("type");
LookupResult R(S, TypeDN, Loc, Sema::LookupOrdinaryName);
if (lookupStdTypeTraitMember(
S, R, Loc, "tuple_element", Args,
diag::err_decomp_decl_std_tuple_element_not_specialized))
return QualType();
auto *TD = R.getAsSingle<TypeDecl>();
if (!TD) {
R.suppressDiagnostics();
S.Diag(Loc, diag::err_decomp_decl_std_tuple_element_not_specialized)
<< printTemplateArgs(S.Context.getPrintingPolicy(), Args);
if (!R.empty())
S.Diag(R.getRepresentativeDecl()->getLocation(), diag::note_declared_at);
return QualType();
}
return S.Context.getTypeDeclType(TD);
}
namespace {
struct BindingDiagnosticTrap {
Sema &S;
DiagnosticErrorTrap Trap;
BindingDecl *BD;
BindingDiagnosticTrap(Sema &S, BindingDecl *BD)
: S(S), Trap(S.Diags), BD(BD) {}
~BindingDiagnosticTrap() {
if (Trap.hasErrorOccurred())
S.Diag(BD->getLocation(), diag::note_in_binding_decl_init) << BD;
}
};
}
static bool checkTupleLikeDecomposition(Sema &S,
ArrayRef<BindingDecl *> Bindings,
VarDecl *Src, QualType DecompType,
const llvm::APSInt &TupleSize) {
if ((int64_t)Bindings.size() != TupleSize) {
S.Diag(Src->getLocation(), diag::err_decomp_decl_wrong_number_bindings)
<< DecompType << (unsigned)Bindings.size() << TupleSize.toString(10)
<< (TupleSize < Bindings.size());
return true;
}
if (Bindings.empty())
return false;
DeclarationName GetDN = S.PP.getIdentifierInfo("get");
// [dcl.decomp]p3:
// The unqualified-id get is looked up in the scope of E by class member
// access lookup ...
LookupResult MemberGet(S, GetDN, Src->getLocation(), Sema::LookupMemberName);
bool UseMemberGet = false;
if (S.isCompleteType(Src->getLocation(), DecompType)) {
if (auto *RD = DecompType->getAsCXXRecordDecl())
S.LookupQualifiedName(MemberGet, RD);
if (MemberGet.isAmbiguous())
return true;
// ... and if that finds at least one declaration that is a function
// template whose first template parameter is a non-type parameter ...
for (NamedDecl *D : MemberGet) {
if (FunctionTemplateDecl *FTD =
dyn_cast<FunctionTemplateDecl>(D->getUnderlyingDecl())) {
TemplateParameterList *TPL = FTD->getTemplateParameters();
if (TPL->size() != 0 &&
isa<NonTypeTemplateParmDecl>(TPL->getParam(0))) {
// ... the initializer is e.get<i>().
UseMemberGet = true;
break;
}
}
}
}
unsigned I = 0;
for (auto *B : Bindings) {
BindingDiagnosticTrap Trap(S, B);
SourceLocation Loc = B->getLocation();
ExprResult E = S.BuildDeclRefExpr(Src, DecompType, VK_LValue, Loc);
if (E.isInvalid())
return true;
// e is an lvalue if the type of the entity is an lvalue reference and
// an xvalue otherwise
if (!Src->getType()->isLValueReferenceType())
E = ImplicitCastExpr::Create(S.Context, E.get()->getType(), CK_NoOp,
E.get(), nullptr, VK_XValue);
TemplateArgumentListInfo Args(Loc, Loc);
Args.addArgument(
getTrivialIntegralTemplateArgument(S, Loc, S.Context.getSizeType(), I));
if (UseMemberGet) {
// if [lookup of member get] finds at least one declaration, the
// initializer is e.get<i-1>().
E = S.BuildMemberReferenceExpr(E.get(), DecompType, Loc, false,
CXXScopeSpec(), SourceLocation(), nullptr,
MemberGet, &Args, nullptr);
if (E.isInvalid())
return true;
E = S.BuildCallExpr(nullptr, E.get(), Loc, None, Loc);
} else {
// Otherwise, the initializer is get<i-1>(e), where get is looked up
// in the associated namespaces.
Expr *Get = UnresolvedLookupExpr::Create(
S.Context, nullptr, NestedNameSpecifierLoc(), SourceLocation(),
DeclarationNameInfo(GetDN, Loc), /*RequiresADL*/true, &Args,
UnresolvedSetIterator(), UnresolvedSetIterator());
Expr *Arg = E.get();
E = S.BuildCallExpr(nullptr, Get, Loc, Arg, Loc);
}
if (E.isInvalid())
return true;
Expr *Init = E.get();
// Given the type T designated by std::tuple_element<i - 1, E>::type,
QualType T = getTupleLikeElementType(S, Loc, I, DecompType);
if (T.isNull())
return true;
// each vi is a variable of type "reference to T" initialized with the
// initializer, where the reference is an lvalue reference if the
// initializer is an lvalue and an rvalue reference otherwise
QualType RefType =
S.BuildReferenceType(T, E.get()->isLValue(), Loc, B->getDeclName());
if (RefType.isNull())
return true;
auto *RefVD = VarDecl::Create(
S.Context, Src->getDeclContext(), Loc, Loc,
B->getDeclName().getAsIdentifierInfo(), RefType,
S.Context.getTrivialTypeSourceInfo(T, Loc), Src->getStorageClass());
RefVD->setLexicalDeclContext(Src->getLexicalDeclContext());
RefVD->setTSCSpec(Src->getTSCSpec());
RefVD->setImplicit();
if (Src->isInlineSpecified())
RefVD->setInlineSpecified();
RefVD->getLexicalDeclContext()->addHiddenDecl(RefVD);
InitializedEntity Entity = InitializedEntity::InitializeBinding(RefVD);
InitializationKind Kind = InitializationKind::CreateCopy(Loc, Loc);
InitializationSequence Seq(S, Entity, Kind, Init);
E = Seq.Perform(S, Entity, Kind, Init);
if (E.isInvalid())
return true;
E = S.ActOnFinishFullExpr(E.get(), Loc, /*DiscardedValue*/ false);
if (E.isInvalid())
return true;
RefVD->setInit(E.get());
- RefVD->checkInitIsICE();
+ if (!E.get()->isValueDependent())
+ RefVD->checkInitIsICE();
E = S.BuildDeclarationNameExpr(CXXScopeSpec(),
DeclarationNameInfo(B->getDeclName(), Loc),
RefVD);
if (E.isInvalid())
return true;
B->setBinding(T, E.get());
I++;
}
return false;
}
/// Find the base class to decompose in a built-in decomposition of a class type.
/// This base class search is, unfortunately, not quite like any other that we
/// perform anywhere else in C++.
static DeclAccessPair findDecomposableBaseClass(Sema &S, SourceLocation Loc,
const CXXRecordDecl *RD,
CXXCastPath &BasePath) {
auto BaseHasFields = [](const CXXBaseSpecifier *Specifier,
CXXBasePath &Path) {
return Specifier->getType()->getAsCXXRecordDecl()->hasDirectFields();
};
const CXXRecordDecl *ClassWithFields = nullptr;
AccessSpecifier AS = AS_public;
if (RD->hasDirectFields())
// [dcl.decomp]p4:
// Otherwise, all of E's non-static data members shall be public direct
// members of E ...
ClassWithFields = RD;
else {
// ... or of ...
CXXBasePaths Paths;
Paths.setOrigin(const_cast<CXXRecordDecl*>(RD));
if (!RD->lookupInBases(BaseHasFields, Paths)) {
// If no classes have fields, just decompose RD itself. (This will work
// if and only if zero bindings were provided.)
return DeclAccessPair::make(const_cast<CXXRecordDecl*>(RD), AS_public);
}
CXXBasePath *BestPath = nullptr;
for (auto &P : Paths) {
if (!BestPath)
BestPath = &P;
else if (!S.Context.hasSameType(P.back().Base->getType(),
BestPath->back().Base->getType())) {
// ... the same ...
S.Diag(Loc, diag::err_decomp_decl_multiple_bases_with_members)
<< false << RD << BestPath->back().Base->getType()
<< P.back().Base->getType();
return DeclAccessPair();
} else if (P.Access < BestPath->Access) {
BestPath = &P;
}
}
// ... unambiguous ...
QualType BaseType = BestPath->back().Base->getType();
if (Paths.isAmbiguous(S.Context.getCanonicalType(BaseType))) {
S.Diag(Loc, diag::err_decomp_decl_ambiguous_base)
<< RD << BaseType << S.getAmbiguousPathsDisplayString(Paths);
return DeclAccessPair();
}
// ... [accessible, implied by other rules] base class of E.
S.CheckBaseClassAccess(Loc, BaseType, S.Context.getRecordType(RD),
*BestPath, diag::err_decomp_decl_inaccessible_base);
AS = BestPath->Access;
ClassWithFields = BaseType->getAsCXXRecordDecl();
S.BuildBasePathArray(Paths, BasePath);
}
// The above search did not check whether the selected class itself has base
// classes with fields, so check that now.
CXXBasePaths Paths;
if (ClassWithFields->lookupInBases(BaseHasFields, Paths)) {
S.Diag(Loc, diag::err_decomp_decl_multiple_bases_with_members)
<< (ClassWithFields == RD) << RD << ClassWithFields
<< Paths.front().back().Base->getType();
return DeclAccessPair();
}
return DeclAccessPair::make(const_cast<CXXRecordDecl*>(ClassWithFields), AS);
}
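// Hedged illustration (hypothetical types) of the base-class search above: a
// single unambiguous base holding all the fields is decomposable, while fields
// spread over several bases hit err_decomp_decl_multiple_bases_with_members.
//
//   struct A { int x, y; };
//   struct B : A {};        // OK:    auto [x, y] = B{};  binds A's fields
//   struct C { int z; };
//   struct D : A, C {};     // error: members live in more than one base, so
//                           //        auto [x, y, z] = D{};  is rejected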
static bool checkMemberDecomposition(Sema &S, ArrayRef<BindingDecl*> Bindings,
ValueDecl *Src, QualType DecompType,
const CXXRecordDecl *OrigRD) {
if (S.RequireCompleteType(Src->getLocation(), DecompType,
diag::err_incomplete_type))
return true;
CXXCastPath BasePath;
DeclAccessPair BasePair =
findDecomposableBaseClass(S, Src->getLocation(), OrigRD, BasePath);
const CXXRecordDecl *RD = cast_or_null<CXXRecordDecl>(BasePair.getDecl());
if (!RD)
return true;
QualType BaseType = S.Context.getQualifiedType(S.Context.getRecordType(RD),
DecompType.getQualifiers());
auto DiagnoseBadNumberOfBindings = [&]() -> bool {
unsigned NumFields =
std::count_if(RD->field_begin(), RD->field_end(),
[](FieldDecl *FD) { return !FD->isUnnamedBitfield(); });
assert(Bindings.size() != NumFields);
S.Diag(Src->getLocation(), diag::err_decomp_decl_wrong_number_bindings)
<< DecompType << (unsigned)Bindings.size() << NumFields
<< (NumFields < Bindings.size());
return true;
};
// all of E's non-static data members shall be [...] well-formed
// when named as e.name in the context of the structured binding,
// E shall not have an anonymous union member, ...
unsigned I = 0;
for (auto *FD : RD->fields()) {
if (FD->isUnnamedBitfield())
continue;
if (FD->isAnonymousStructOrUnion()) {
S.Diag(Src->getLocation(), diag::err_decomp_decl_anon_union_member)
<< DecompType << FD->getType()->isUnionType();
S.Diag(FD->getLocation(), diag::note_declared_at);
return true;
}
// We have a real field to bind.
if (I >= Bindings.size())
return DiagnoseBadNumberOfBindings();
auto *B = Bindings[I++];
SourceLocation Loc = B->getLocation();
// The field must be accessible in the context of the structured binding.
// We already checked that the base class is accessible.
// FIXME: Add 'const' to AccessedEntity's classes so we can remove the
// const_cast here.
S.CheckStructuredBindingMemberAccess(
Loc, const_cast<CXXRecordDecl *>(OrigRD),
DeclAccessPair::make(FD, CXXRecordDecl::MergeAccess(
BasePair.getAccess(), FD->getAccess())));
// Initialize the binding to Src.FD.
ExprResult E = S.BuildDeclRefExpr(Src, DecompType, VK_LValue, Loc);
if (E.isInvalid())
return true;
E = S.ImpCastExprToType(E.get(), BaseType, CK_UncheckedDerivedToBase,
VK_LValue, &BasePath);
if (E.isInvalid())
return true;
E = S.BuildFieldReferenceExpr(E.get(), /*IsArrow*/ false, Loc,
CXXScopeSpec(), FD,
DeclAccessPair::make(FD, FD->getAccess()),
DeclarationNameInfo(FD->getDeclName(), Loc));
if (E.isInvalid())
return true;
// If the type of the member is T, the referenced type is cv T, where cv is
// the cv-qualification of the decomposition expression.
//
// FIXME: We resolve a defect here: if the field is mutable, we do not add
// 'const' to the type of the field.
Qualifiers Q = DecompType.getQualifiers();
if (FD->isMutable())
Q.removeConst();
B->setBinding(S.BuildQualifiedType(FD->getType(), Loc, Q), E.get());
}
if (I != Bindings.size())
return DiagnoseBadNumberOfBindings();
return false;
}
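// Hypothetical declarations hitting the member-decomposition diagnostics
// above (binding count and anonymous-union member):
//
//   struct S { int a, b; };
//   auto [only_one] = S{};          // error: 1 binding for 2 non-static members
//
//   struct U { union { int i; float f; }; };
//   auto [u] = U{};                 // error: E has an anonymous union member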
void Sema::CheckCompleteDecompositionDeclaration(DecompositionDecl *DD) {
QualType DecompType = DD->getType();
// If the type of the decomposition is dependent, then so is the type of
// each binding.
if (DecompType->isDependentType()) {
for (auto *B : DD->bindings())
B->setType(Context.DependentTy);
return;
}
DecompType = DecompType.getNonReferenceType();
ArrayRef<BindingDecl*> Bindings = DD->bindings();
// C++1z [dcl.decomp]/2:
// If E is an array type [...]
// As an extension, we also support decomposition of built-in complex and
// vector types.
if (auto *CAT = Context.getAsConstantArrayType(DecompType)) {
if (checkArrayDecomposition(*this, Bindings, DD, DecompType, CAT))
DD->setInvalidDecl();
return;
}
if (auto *VT = DecompType->getAs<VectorType>()) {
if (checkVectorDecomposition(*this, Bindings, DD, DecompType, VT))
DD->setInvalidDecl();
return;
}
if (auto *CT = DecompType->getAs<ComplexType>()) {
if (checkComplexDecomposition(*this, Bindings, DD, DecompType, CT))
DD->setInvalidDecl();
return;
}
// C++1z [dcl.decomp]/3:
// if the expression std::tuple_size<E>::value is a well-formed integral
// constant expression, [...]
llvm::APSInt TupleSize(32);
switch (isTupleLike(*this, DD->getLocation(), DecompType, TupleSize)) {
case IsTupleLike::Error:
DD->setInvalidDecl();
return;
case IsTupleLike::TupleLike:
if (checkTupleLikeDecomposition(*this, Bindings, DD, DecompType, TupleSize))
DD->setInvalidDecl();
return;
case IsTupleLike::NotTupleLike:
break;
}
// C++1z [dcl.dcl]/8:
// [E shall be of array or non-union class type]
CXXRecordDecl *RD = DecompType->getAsCXXRecordDecl();
if (!RD || RD->isUnion()) {
Diag(DD->getLocation(), diag::err_decomp_decl_unbindable_type)
<< DD << !RD << DecompType;
DD->setInvalidDecl();
return;
}
// C++1z [dcl.decomp]/4:
// all of E's non-static data members shall be [...] direct members of
// E or of the same unambiguous public base class of E, ...
if (checkMemberDecomposition(*this, Bindings, DD, DecompType, RD))
DD->setInvalidDecl();
}
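// For reference, hypothetical snippets covered by the dispatch above (the
// complex and vector forms are Clang extensions, as noted in the comments):
//
//   int arr[2] = {1, 2};
//   auto [a0, a1] = arr;                        // constant array case
//
//   _Complex double z = 1.0;
//   auto [re, im] = z;                          // extension: complex
//
//   typedef int v4i __attribute__((vector_size(16)));
//   v4i v = {1, 2, 3, 4};
//   auto [i0, i1, i2, i3] = v;                  // extension: vector
//
// Anything else falls through to the tuple-like check and then to the
// member-decomposition path.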
/// Merge the exception specifications of two variable declarations.
///
/// This is called when there's a redeclaration of a VarDecl. The function
/// checks if the redeclaration might have an exception specification and
/// validates compatibility and merges the specs if necessary.
void Sema::MergeVarDeclExceptionSpecs(VarDecl *New, VarDecl *Old) {
// Shortcut if exceptions are disabled.
if (!getLangOpts().CXXExceptions)
return;
assert(Context.hasSameType(New->getType(), Old->getType()) &&
"Should only be called if types are otherwise the same.");
QualType NewType = New->getType();
QualType OldType = Old->getType();
// We're only interested in pointers and references to functions, as well
// as pointers to member functions.
if (const ReferenceType *R = NewType->getAs<ReferenceType>()) {
NewType = R->getPointeeType();
OldType = OldType->getAs<ReferenceType>()->getPointeeType();
} else if (const PointerType *P = NewType->getAs<PointerType>()) {
NewType = P->getPointeeType();
OldType = OldType->getAs<PointerType>()->getPointeeType();
} else if (const MemberPointerType *M = NewType->getAs<MemberPointerType>()) {
NewType = M->getPointeeType();
OldType = OldType->getAs<MemberPointerType>()->getPointeeType();
}
if (!NewType->isFunctionProtoType())
return;
// There are lots of special cases for functions. For function pointers, system
// libraries are hopefully not as broken, so we don't need these
// workarounds.
if (CheckEquivalentExceptionSpec(
OldType->getAs<FunctionProtoType>(), Old->getLocation(),
NewType->getAs<FunctionProtoType>(), New->getLocation())) {
New->setInvalidDecl();
}
}
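// A minimal sketch (hypothetical declarations) of what gets merged above: a
// redeclared function-pointer variable whose pointee carries an exception
// specification. Pre-C++17 the specification is not part of the type, so both
// declarations have the same type and only the specs are compared here.
//
//   extern void (*handler)() throw();
//   extern void (*handler)() throw();   // redeclaration: specs checked/merged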
/// CheckCXXDefaultArguments - Verify that the default arguments for a
/// function declaration are well-formed according to C++
/// [dcl.fct.default].
void Sema::CheckCXXDefaultArguments(FunctionDecl *FD) {
unsigned NumParams = FD->getNumParams();
unsigned p;
// Find first parameter with a default argument
for (p = 0; p < NumParams; ++p) {
ParmVarDecl *Param = FD->getParamDecl(p);
if (Param->hasDefaultArg())
break;
}
// C++11 [dcl.fct.default]p4:
// In a given function declaration, each parameter subsequent to a parameter
// with a default argument shall have a default argument supplied in this or
// a previous declaration or shall be a function parameter pack. A default
// argument shall not be redefined by a later declaration (not even to the
// same value).
unsigned LastMissingDefaultArg = 0;
for (; p < NumParams; ++p) {
ParmVarDecl *Param = FD->getParamDecl(p);
if (!Param->hasDefaultArg() && !Param->isParameterPack()) {
if (Param->isInvalidDecl())
/* We already complained about this parameter. */;
else if (Param->getIdentifier())
Diag(Param->getLocation(),
diag::err_param_default_argument_missing_name)
<< Param->getIdentifier();
else
Diag(Param->getLocation(),
diag::err_param_default_argument_missing);
LastMissingDefaultArg = p;
}
}
if (LastMissingDefaultArg > 0) {
// Some default arguments were missing. Clear out all of the
// default arguments up to (and including) the last missing
// default argument, so that we leave the function parameters
// in a semantically valid state.
for (p = 0; p <= LastMissingDefaultArg; ++p) {
ParmVarDecl *Param = FD->getParamDecl(p);
if (Param->hasDefaultArg()) {
Param->setDefaultArg(nullptr);
}
}
}
}
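// Hypothetical declarations exercising the [dcl.fct.default]p4 check above:
//
//   void f(int a, int b = 1, int c);        // error: 'c' needs a default
//   void g(int a, int b = 1, int c = 2);    // OK
//   template <class... Ts>
//   void h(int a = 1, Ts... rest);          // OK: trailing parameter pack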
// CheckConstexprParameterTypes - Check whether a function's parameter types
// are all literal types. If so, return true. If not, produce a suitable
// diagnostic and return false.
static bool CheckConstexprParameterTypes(Sema &SemaRef,
const FunctionDecl *FD) {
unsigned ArgIndex = 0;
const FunctionProtoType *FT = FD->getType()->getAs<FunctionProtoType>();
for (FunctionProtoType::param_type_iterator i = FT->param_type_begin(),
e = FT->param_type_end();
i != e; ++i, ++ArgIndex) {
const ParmVarDecl *PD = FD->getParamDecl(ArgIndex);
SourceLocation ParamLoc = PD->getLocation();
if (!(*i)->isDependentType() &&
SemaRef.RequireLiteralType(
ParamLoc, *i, diag::err_constexpr_non_literal_param, ArgIndex + 1,
PD->getSourceRange(), isa<CXXConstructorDecl>(FD),
FD->isConsteval()))
return false;
}
return true;
}
/// Get diagnostic %select index for tag kind for
/// record diagnostic message.
/// WARNING: Indexes apply to particular diagnostics only!
///
/// \returns diagnostic %select index.
static unsigned getRecordDiagFromTagKind(TagTypeKind Tag) {
switch (Tag) {
case TTK_Struct: return 0;
case TTK_Interface: return 1;
case TTK_Class: return 2;
default: llvm_unreachable("Invalid tag kind for record diagnostic!");
}
}
// CheckConstexprFunctionDecl - Check whether a function declaration satisfies
// the requirements of a constexpr function definition or a constexpr
// constructor definition. If so, return true. If not, produce appropriate
// diagnostics and return false.
//
// This implements C++11 [dcl.constexpr]p3,4, as amended by DR1360.
bool Sema::CheckConstexprFunctionDecl(const FunctionDecl *NewFD) {
const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(NewFD);
if (MD && MD->isInstance()) {
// C++11 [dcl.constexpr]p4:
// The definition of a constexpr constructor shall satisfy the following
// constraints:
// - the class shall not have any virtual base classes;
//
// FIXME: This only applies to constructors, not arbitrary member
// functions.
const CXXRecordDecl *RD = MD->getParent();
if (RD->getNumVBases()) {
Diag(NewFD->getLocation(), diag::err_constexpr_virtual_base)
<< isa<CXXConstructorDecl>(NewFD)
<< getRecordDiagFromTagKind(RD->getTagKind()) << RD->getNumVBases();
for (const auto &I : RD->vbases())
Diag(I.getBeginLoc(), diag::note_constexpr_virtual_base_here)
<< I.getSourceRange();
return false;
}
}
if (!isa<CXXConstructorDecl>(NewFD)) {
// C++11 [dcl.constexpr]p3:
// The definition of a constexpr function shall satisfy the following
// constraints:
// - it shall not be virtual; (removed in C++20)
const CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(NewFD);
if (Method && Method->isVirtual()) {
if (getLangOpts().CPlusPlus2a) {
Diag(Method->getLocation(), diag::warn_cxx17_compat_constexpr_virtual);
} else {
Method = Method->getCanonicalDecl();
Diag(Method->getLocation(), diag::err_constexpr_virtual);
// If it's not obvious why this function is virtual, find an overridden
// function which uses the 'virtual' keyword.
const CXXMethodDecl *WrittenVirtual = Method;
while (!WrittenVirtual->isVirtualAsWritten())
WrittenVirtual = *WrittenVirtual->begin_overridden_methods();
if (WrittenVirtual != Method)
Diag(WrittenVirtual->getLocation(),
diag::note_overridden_virtual_function);
return false;
}
}
// - its return type shall be a literal type;
QualType RT = NewFD->getReturnType();
if (!RT->isDependentType() &&
RequireLiteralType(NewFD->getLocation(), RT,
diag::err_constexpr_non_literal_return,
NewFD->isConsteval()))
return false;
}
// - each of its parameter types shall be a literal type;
if (!CheckConstexprParameterTypes(*this, NewFD))
return false;
return true;
}
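// Hedged examples (hypothetical classes) of the declaration-level constexpr
// checks above:
//
//   struct VB {};
//   struct D : virtual VB {
//     constexpr D() {}                    // error: class has a virtual base
//   };
//
//   struct S {
//     constexpr virtual int f() const { return 0; }  // error before C++20
//   };
//
//   struct NonLiteral { ~NonLiteral(); };
//   constexpr int g(NonLiteral);          // error: non-literal parameter type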
/// Check the given declaration statement is legal within a constexpr function
/// body. C++11 [dcl.constexpr]p3,p4, and C++1y [dcl.constexpr]p3.
///
/// \return true if the body is OK (maybe only as an extension), false if we
/// have diagnosed a problem.
static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl,
DeclStmt *DS, SourceLocation &Cxx1yLoc) {
// C++11 [dcl.constexpr]p3 and p4:
// The definition of a constexpr function(p3) or constructor(p4) [...] shall
// contain only
for (const auto *DclIt : DS->decls()) {
switch (DclIt->getKind()) {
case Decl::StaticAssert:
case Decl::Using:
case Decl::UsingShadow:
case Decl::UsingDirective:
case Decl::UnresolvedUsingTypename:
case Decl::UnresolvedUsingValue:
// - static_assert-declarations
// - using-declarations,
// - using-directives,
continue;
case Decl::Typedef:
case Decl::TypeAlias: {
// - typedef declarations and alias-declarations that do not define
// classes or enumerations,
const auto *TN = cast<TypedefNameDecl>(DclIt);
if (TN->getUnderlyingType()->isVariablyModifiedType()) {
// Don't allow variably-modified types in constexpr functions.
TypeLoc TL = TN->getTypeSourceInfo()->getTypeLoc();
SemaRef.Diag(TL.getBeginLoc(), diag::err_constexpr_vla)
<< TL.getSourceRange() << TL.getType()
<< isa<CXXConstructorDecl>(Dcl);
return false;
}
continue;
}
case Decl::Enum:
case Decl::CXXRecord:
// C++1y allows types to be defined, not just declared.
if (cast<TagDecl>(DclIt)->isThisDeclarationADefinition())
SemaRef.Diag(DS->getBeginLoc(),
SemaRef.getLangOpts().CPlusPlus14
? diag::warn_cxx11_compat_constexpr_type_definition
: diag::ext_constexpr_type_definition)
<< isa<CXXConstructorDecl>(Dcl);
continue;
case Decl::EnumConstant:
case Decl::IndirectField:
case Decl::ParmVar:
// These can only appear with other declarations which are banned in
// C++11 and permitted in C++1y, so ignore them.
continue;
case Decl::Var:
case Decl::Decomposition: {
// C++1y [dcl.constexpr]p3 allows anything except:
// a definition of a variable of non-literal type or of static or
// thread storage duration or for which no initialization is performed.
const auto *VD = cast<VarDecl>(DclIt);
if (VD->isThisDeclarationADefinition()) {
if (VD->isStaticLocal()) {
SemaRef.Diag(VD->getLocation(),
diag::err_constexpr_local_var_static)
<< isa<CXXConstructorDecl>(Dcl)
<< (VD->getTLSKind() == VarDecl::TLS_Dynamic);
return false;
}
if (!VD->getType()->isDependentType() &&
SemaRef.RequireLiteralType(
VD->getLocation(), VD->getType(),
diag::err_constexpr_local_var_non_literal_type,
isa<CXXConstructorDecl>(Dcl)))
return false;
if (!VD->getType()->isDependentType() &&
!VD->hasInit() && !VD->isCXXForRangeDecl()) {
SemaRef.Diag(VD->getLocation(),
diag::err_constexpr_local_var_no_init)
<< isa<CXXConstructorDecl>(Dcl);
return false;
}
}
SemaRef.Diag(VD->getLocation(),
SemaRef.getLangOpts().CPlusPlus14
? diag::warn_cxx11_compat_constexpr_local_var
: diag::ext_constexpr_local_var)
<< isa<CXXConstructorDecl>(Dcl);
continue;
}
case Decl::NamespaceAlias:
case Decl::Function:
// These are disallowed in C++11 and permitted in C++1y. Allow them
// everywhere as an extension.
if (!Cxx1yLoc.isValid())
Cxx1yLoc = DS->getBeginLoc();
continue;
default:
SemaRef.Diag(DS->getBeginLoc(), diag::err_constexpr_body_invalid_stmt)
<< isa<CXXConstructorDecl>(Dcl) << Dcl->isConsteval();
return false;
}
}
return true;
}
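// Hypothetical constexpr body showing which declaration statements pass the
// switch above (several are C++14-only and extensions in C++11 mode):
//
//   constexpr int f(int n) {
//     static_assert(sizeof(int) >= 2, "");   // always allowed
//     using T = int;                         // allowed: alias-declaration
//     // static T cache = 0;                 // error: static storage duration
//     T local = n;                           // C++14: OK; C++11: extension
//     return local + 1;
//   }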
/// Check that the given field is initialized within a constexpr constructor.
///
/// \param Dcl The constexpr constructor being checked.
/// \param Field The field being checked. This may be a member of an anonymous
/// struct or union nested within the class being checked.
/// \param Inits All declarations, including anonymous struct/union members and
/// indirect members, for which any initialization was provided.
/// \param Diagnosed Set to true if an error is produced.
static void CheckConstexprCtorInitializer(Sema &SemaRef,
const FunctionDecl *Dcl,
FieldDecl *Field,
llvm::SmallSet<Decl*, 16> &Inits,
bool &Diagnosed) {
if (Field->isInvalidDecl())
return;
if (Field->isUnnamedBitfield())
return;
// Anonymous unions with no variant members and empty anonymous structs do not
// need to be explicitly initialized. FIXME: Anonymous structs that contain no
// indirect fields don't need initializing.
if (Field->isAnonymousStructOrUnion() &&
(Field->getType()->isUnionType()
? !Field->getType()->getAsCXXRecordDecl()->hasVariantMembers()
: Field->getType()->getAsCXXRecordDecl()->isEmpty()))
return;
if (!Inits.count(Field)) {
if (!Diagnosed) {
SemaRef.Diag(Dcl->getLocation(), diag::err_constexpr_ctor_missing_init);
Diagnosed = true;
}
SemaRef.Diag(Field->getLocation(), diag::note_constexpr_ctor_missing_init);
} else if (Field->isAnonymousStructOrUnion()) {
const RecordDecl *RD = Field->getType()->castAs<RecordType>()->getDecl();
for (auto *I : RD->fields())
// If an anonymous union contains an anonymous struct of which any member
// is initialized, all members must be initialized.
if (!RD->isUnion() || Inits.count(I))
CheckConstexprCtorInitializer(SemaRef, Dcl, I, Inits, Diagnosed);
}
}
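// Hypothetical constructor caught by the missing-initializer walk above,
// diagnosed here under the C++11/14 rules this function implements:
//
//   struct Point {
//     int x, y;
//     constexpr Point(int v) : x(v) {}   // error: 'y' is never initialized
//   };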
/// Check the provided statement is allowed in a constexpr function
/// definition.
static bool
CheckConstexprFunctionStmt(Sema &SemaRef, const FunctionDecl *Dcl, Stmt *S,
SmallVectorImpl<SourceLocation> &ReturnStmts,
SourceLocation &Cxx1yLoc, SourceLocation &Cxx2aLoc) {
// - its function-body shall be [...] a compound-statement that contains only
switch (S->getStmtClass()) {
case Stmt::NullStmtClass:
// - null statements,
return true;
case Stmt::DeclStmtClass:
// - static_assert-declarations
// - using-declarations,
// - using-directives,
// - typedef declarations and alias-declarations that do not define
// classes or enumerations,
if (!CheckConstexprDeclStmt(SemaRef, Dcl, cast<DeclStmt>(S), Cxx1yLoc))
return false;
return true;
case Stmt::ReturnStmtClass:
// - and exactly one return statement;
if (isa<CXXConstructorDecl>(Dcl)) {
// C++1y allows return statements in constexpr constructors.
if (!Cxx1yLoc.isValid())
Cxx1yLoc = S->getBeginLoc();
return true;
}
ReturnStmts.push_back(S->getBeginLoc());
return true;
case Stmt::CompoundStmtClass: {
// C++1y allows compound-statements.
if (!Cxx1yLoc.isValid())
Cxx1yLoc = S->getBeginLoc();
CompoundStmt *CompStmt = cast<CompoundStmt>(S);
for (auto *BodyIt : CompStmt->body()) {
if (!CheckConstexprFunctionStmt(SemaRef, Dcl, BodyIt, ReturnStmts,
Cxx1yLoc, Cxx2aLoc))
return false;
}
return true;
}
case Stmt::AttributedStmtClass:
if (!Cxx1yLoc.isValid())
Cxx1yLoc = S->getBeginLoc();
return true;
case Stmt::IfStmtClass: {
// C++1y allows if-statements.
if (!Cxx1yLoc.isValid())
Cxx1yLoc = S->getBeginLoc();
IfStmt *If = cast<IfStmt>(S);
if (!CheckConstexprFunctionStmt(SemaRef, Dcl, If->getThen(), ReturnStmts,
Cxx1yLoc, Cxx2aLoc))
return false;
if (If->getElse() &&
!CheckConstexprFunctionStmt(SemaRef, Dcl, If->getElse(), ReturnStmts,
Cxx1yLoc, Cxx2aLoc))
return false;
return true;
}
case Stmt::WhileStmtClass:
case Stmt::DoStmtClass:
case Stmt::ForStmtClass:
case Stmt::CXXForRangeStmtClass:
case Stmt::ContinueStmtClass:
// C++1y allows all of these. We don't allow them as extensions in C++11,
// because they don't make sense without variable mutation.
if (!SemaRef.getLangOpts().CPlusPlus14)
break;
if (!Cxx1yLoc.isValid())
Cxx1yLoc = S->getBeginLoc();
for (Stmt *SubStmt : S->children())
if (SubStmt &&
!CheckConstexprFunctionStmt(SemaRef, Dcl, SubStmt, ReturnStmts,
Cxx1yLoc, Cxx2aLoc))
return false;
return true;
case Stmt::SwitchStmtClass:
case Stmt::CaseStmtClass:
case Stmt::DefaultStmtClass:
case Stmt::BreakStmtClass:
// C++1y allows switch-statements, and since they don't need variable
// mutation, we can reasonably allow them in C++11 as an extension.
if (!Cxx1yLoc.isValid())
Cxx1yLoc = S->getBeginLoc();
for (Stmt *SubStmt : S->children())
if (SubStmt &&
!CheckConstexprFunctionStmt(SemaRef, Dcl, SubStmt, ReturnStmts,
Cxx1yLoc, Cxx2aLoc))
return false;
return true;
case Stmt::CXXTryStmtClass:
if (Cxx2aLoc.isInvalid())
Cxx2aLoc = S->getBeginLoc();
for (Stmt *SubStmt : S->children()) {
if (SubStmt &&
!CheckConstexprFunctionStmt(SemaRef, Dcl, SubStmt, ReturnStmts,
Cxx1yLoc, Cxx2aLoc))
return false;
}
return true;
case Stmt::CXXCatchStmtClass:
// Do not bother checking the language mode (already covered by the
// try block check).
if (!CheckConstexprFunctionStmt(SemaRef, Dcl,
cast<CXXCatchStmt>(S)->getHandlerBlock(),
ReturnStmts, Cxx1yLoc, Cxx2aLoc))
return false;
return true;
default:
if (!isa<Expr>(S))
break;
// C++1y allows expression-statements.
if (!Cxx1yLoc.isValid())
Cxx1yLoc = S->getBeginLoc();
return true;
}
SemaRef.Diag(S->getBeginLoc(), diag::err_constexpr_body_invalid_stmt)
<< isa<CXXConstructorDecl>(Dcl) << Dcl->isConsteval();
return false;
}
/// Check that the body of the given constexpr function declaration contains
/// only the permitted types of statement. C++11 [dcl.constexpr]p3,p4.
///
/// \return true if the body is OK, false if we have diagnosed a problem.
bool Sema::CheckConstexprFunctionBody(const FunctionDecl *Dcl, Stmt *Body) {
SmallVector<SourceLocation, 4> ReturnStmts;
if (isa<CXXTryStmt>(Body)) {
// C++11 [dcl.constexpr]p3:
// The definition of a constexpr function shall satisfy the following
// constraints: [...]
// - its function-body shall be = delete, = default, or a
// compound-statement
//
// C++11 [dcl.constexpr]p4:
// In the definition of a constexpr constructor, [...]
// - its function-body shall not be a function-try-block;
//
// This restriction is lifted in C++2a, as long as inner statements also
// apply the general constexpr rules.
Diag(Body->getBeginLoc(),
!getLangOpts().CPlusPlus2a
? diag::ext_constexpr_function_try_block_cxx2a
: diag::warn_cxx17_compat_constexpr_function_try_block)
<< isa<CXXConstructorDecl>(Dcl);
}
// - its function-body shall be [...] a compound-statement that contains only
// [... list of cases ...]
//
// Note that walking the children here is enough to properly check for
// CompoundStmt and CXXTryStmt body.
SourceLocation Cxx1yLoc, Cxx2aLoc;
for (Stmt *SubStmt : Body->children()) {
if (SubStmt &&
!CheckConstexprFunctionStmt(*this, Dcl, SubStmt, ReturnStmts,
Cxx1yLoc, Cxx2aLoc))
return false;
}
if (Cxx2aLoc.isValid())
Diag(Cxx2aLoc,
getLangOpts().CPlusPlus2a
? diag::warn_cxx17_compat_constexpr_body_invalid_stmt
: diag::ext_constexpr_body_invalid_stmt_cxx2a)
<< isa<CXXConstructorDecl>(Dcl);
if (Cxx1yLoc.isValid())
Diag(Cxx1yLoc,
getLangOpts().CPlusPlus14
? diag::warn_cxx11_compat_constexpr_body_invalid_stmt
: diag::ext_constexpr_body_invalid_stmt)
<< isa<CXXConstructorDecl>(Dcl);
if (const CXXConstructorDecl *Constructor
= dyn_cast<CXXConstructorDecl>(Dcl)) {
const CXXRecordDecl *RD = Constructor->getParent();
// DR1359:
// - every non-variant non-static data member and base class sub-object
// shall be initialized;
// DR1460:
// - if the class is a union having variant members, exactly one of them
// shall be initialized;
if (RD->isUnion()) {
if (Constructor->getNumCtorInitializers() == 0 &&
RD->hasVariantMembers()) {
Diag(Dcl->getLocation(), diag::err_constexpr_union_ctor_no_init);
return false;
}
} else if (!Constructor->isDependentContext() &&
!Constructor->isDelegatingConstructor()) {
assert(RD->getNumVBases() == 0 && "constexpr ctor with virtual bases");
// Skip detailed checking if we have enough initializers, and we would
// allow at most one initializer per member.
bool AnyAnonStructUnionMembers = false;
unsigned Fields = 0;
for (CXXRecordDecl::field_iterator I = RD->field_begin(),
E = RD->field_end(); I != E; ++I, ++Fields) {
if (I->isAnonymousStructOrUnion()) {
AnyAnonStructUnionMembers = true;
break;
}
}
// DR1460:
// - if the class is a union-like class, but is not a union, for each of
// its anonymous union members having variant members, exactly one of
// them shall be initialized;
if (AnyAnonStructUnionMembers ||
Constructor->getNumCtorInitializers() != RD->getNumBases() + Fields) {
// Check initialization of non-static data members. Base classes are
// always initialized so do not need to be checked. Dependent bases
// might not have initializers in the member initializer list.
llvm::SmallSet<Decl*, 16> Inits;
for (const auto *I: Constructor->inits()) {
if (FieldDecl *FD = I->getMember())
Inits.insert(FD);
else if (IndirectFieldDecl *ID = I->getIndirectMember())
Inits.insert(ID->chain_begin(), ID->chain_end());
}
bool Diagnosed = false;
for (auto *I : RD->fields())
CheckConstexprCtorInitializer(*this, Dcl, I, Inits, Diagnosed);
if (Diagnosed)
return false;
}
}
} else {
if (ReturnStmts.empty()) {
// C++1y doesn't require constexpr functions to contain a 'return'
// statement. We still do, unless the return type might be void, because
// otherwise if there's no return statement, the function cannot
// be used in a core constant expression.
bool OK = getLangOpts().CPlusPlus14 &&
(Dcl->getReturnType()->isVoidType() ||
Dcl->getReturnType()->isDependentType());
Diag(Dcl->getLocation(),
OK ? diag::warn_cxx11_compat_constexpr_body_no_return
: diag::err_constexpr_body_no_return)
<< Dcl->isConsteval();
if (!OK)
return false;
} else if (ReturnStmts.size() > 1) {
Diag(ReturnStmts.back(),
getLangOpts().CPlusPlus14
? diag::warn_cxx11_compat_constexpr_body_multiple_return
: diag::ext_constexpr_body_multiple_return);
for (unsigned I = 0; I < ReturnStmts.size() - 1; ++I)
Diag(ReturnStmts[I], diag::note_constexpr_body_previous_return);
}
}
// C++11 [dcl.constexpr]p5:
// if no function argument values exist such that the function invocation
// substitution would produce a constant expression, the program is
// ill-formed; no diagnostic required.
// C++11 [dcl.constexpr]p3:
// - every constructor call and implicit conversion used in initializing the
// return value shall be one of those allowed in a constant expression.
// C++11 [dcl.constexpr]p4:
// - every constructor involved in initializing non-static data members and
// base class sub-objects shall be a constexpr constructor.
SmallVector<PartialDiagnosticAt, 8> Diags;
if (!Expr::isPotentialConstantExpr(Dcl, Diags)) {
Diag(Dcl->getLocation(), diag::ext_constexpr_function_never_constant_expr)
<< isa<CXXConstructorDecl>(Dcl);
for (size_t I = 0, N = Diags.size(); I != N; ++I)
Diag(Diags[I].first, Diags[I].second);
// Don't return false here: we allow this for compatibility in
// system headers.
}
return true;
}
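// Hypothetical bodies exercising the return-statement accounting above:
//
//   constexpr int f(bool b) {
//     if (b) return 1;   // C++14: OK; C++11: if-statements and multiple
//     return 2;          //        returns are only accepted as extensions
//   }
//
//   constexpr void g() {}  // C++14: no return needed for a void return type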
/// Get the class that is directly named by the current context. This is the
/// class for which an unqualified-id in this scope could name a constructor
/// or destructor.
///
/// If the scope specifier denotes a class, this will be that class.
/// If the scope specifier is empty, this will be the class whose
/// member-specification we are currently within. Otherwise, there
/// is no such class.
CXXRecordDecl *Sema::getCurrentClass(Scope *, const CXXScopeSpec *SS) {
assert(getLangOpts().CPlusPlus && "No class names in C!");
if (SS && SS->isInvalid())
return nullptr;
if (SS && SS->isNotEmpty()) {
DeclContext *DC = computeDeclContext(*SS, true);
return dyn_cast_or_null<CXXRecordDecl>(DC);
}
return dyn_cast_or_null<CXXRecordDecl>(CurContext);
}
/// isCurrentClassName - Determine whether the identifier II is the
/// name of the class type currently being defined. In the case of
/// nested classes, this will only return true if II is the name of
/// the innermost class.
bool Sema::isCurrentClassName(const IdentifierInfo &II, Scope *S,
const CXXScopeSpec *SS) {
CXXRecordDecl *CurDecl = getCurrentClass(S, SS);
return CurDecl && &II == CurDecl->getIdentifier();
}
/// Determine whether the identifier II is a typo for the name of
/// the class type currently being defined. If so, update it to the identifier
/// that should have been used.
bool Sema::isCurrentClassNameTypo(IdentifierInfo *&II, const CXXScopeSpec *SS) {
assert(getLangOpts().CPlusPlus && "No class names in C!");
if (!getLangOpts().SpellChecking)
return false;
CXXRecordDecl *CurDecl;
if (SS && SS->isSet() && !SS->isInvalid()) {
DeclContext *DC = computeDeclContext(*SS, true);
CurDecl = dyn_cast_or_null<CXXRecordDecl>(DC);
} else
CurDecl = dyn_cast_or_null<CXXRecordDecl>(CurContext);
if (CurDecl && CurDecl->getIdentifier() && II != CurDecl->getIdentifier() &&
3 * II->getName().edit_distance(CurDecl->getIdentifier()->getName())
< II->getLength()) {
II = CurDecl->getIdentifier();
return true;
}
return false;
}
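// Hypothetical typo this correction targets: an identifier close (by edit
// distance) to the current class name, e.g. a misspelled constructor name.
//
//   struct Vector {
//     Vextor(int);   // with spell checking on, 'Vextor' is close enough to
//   };               // be corrected to 'Vector' and treated as a constructor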
/// Determine whether the given class is a base class of the given
/// class, including looking at dependent bases.
static bool findCircularInheritance(const CXXRecordDecl *Class,
const CXXRecordDecl *Current) {
SmallVector<const CXXRecordDecl*, 8> Queue;
Class = Class->getCanonicalDecl();
while (true) {
for (const auto &I : Current->bases()) {
CXXRecordDecl *Base = I.getType()->getAsCXXRecordDecl();
if (!Base)
continue;
Base = Base->getDefinition();
if (!Base)
continue;
if (Base->getCanonicalDecl() == Class)
return true;
Queue.push_back(Base);
}
if (Queue.empty())
return false;
Current = Queue.pop_back_val();
}
return false;
}
/// Check the validity of a C++ base class specifier.
///
/// \returns a new CXXBaseSpecifier if well-formed; otherwise, emits
/// diagnostics and returns NULL.
CXXBaseSpecifier *
Sema::CheckBaseSpecifier(CXXRecordDecl *Class,
SourceRange SpecifierRange,
bool Virtual, AccessSpecifier Access,
TypeSourceInfo *TInfo,
SourceLocation EllipsisLoc) {
QualType BaseType = TInfo->getType();
// C++ [class.union]p1:
// A union shall not have base classes.
if (Class->isUnion()) {
Diag(Class->getLocation(), diag::err_base_clause_on_union)
<< SpecifierRange;
return nullptr;
}
if (EllipsisLoc.isValid() &&
!TInfo->getType()->containsUnexpandedParameterPack()) {
Diag(EllipsisLoc, diag::err_pack_expansion_without_parameter_packs)
<< TInfo->getTypeLoc().getSourceRange();
EllipsisLoc = SourceLocation();
}
SourceLocation BaseLoc = TInfo->getTypeLoc().getBeginLoc();
if (BaseType->isDependentType()) {
// Make sure that we don't have circular inheritance among our dependent
// bases. For non-dependent bases, the check for completeness below handles
// this.
if (CXXRecordDecl *BaseDecl = BaseType->getAsCXXRecordDecl()) {
if (BaseDecl->getCanonicalDecl() == Class->getCanonicalDecl() ||
((BaseDecl = BaseDecl->getDefinition()) &&
findCircularInheritance(Class, BaseDecl))) {
Diag(BaseLoc, diag::err_circular_inheritance)
<< BaseType << Context.getTypeDeclType(Class);
if (BaseDecl->getCanonicalDecl() != Class->getCanonicalDecl())
Diag(BaseDecl->getLocation(), diag::note_previous_decl)
<< BaseType;
return nullptr;
}
}
return new (Context) CXXBaseSpecifier(SpecifierRange, Virtual,
Class->getTagKind() == TTK_Class,
Access, TInfo, EllipsisLoc);
}
// Base specifiers must be record types.
if (!BaseType->isRecordType()) {
Diag(BaseLoc, diag::err_base_must_be_class) << SpecifierRange;
return nullptr;
}
// C++ [class.union]p1:
// A union shall not be used as a base class.
if (BaseType->isUnionType()) {
Diag(BaseLoc, diag::err_union_as_base_class) << SpecifierRange;
return nullptr;
}
// For the MS ABI, propagate DLL attributes to base class templates.
if (Context.getTargetInfo().getCXXABI().isMicrosoft()) {
if (Attr *ClassAttr = getDLLAttr(Class)) {
if (auto *BaseTemplate = dyn_cast_or_null<ClassTemplateSpecializationDecl>(
BaseType->getAsCXXRecordDecl())) {
propagateDLLAttrToBaseClassTemplate(Class, ClassAttr, BaseTemplate,
BaseLoc);
}
}
}
// C++ [class.derived]p2:
// The class-name in a base-specifier shall not be an incompletely
// defined class.
if (RequireCompleteType(BaseLoc, BaseType,
diag::err_incomplete_base_class, SpecifierRange)) {
Class->setInvalidDecl();
return nullptr;
}
// If the base class is polymorphic or isn't empty, the new one is/isn't, too.
RecordDecl *BaseDecl = BaseType->getAs<RecordType>()->getDecl();
assert(BaseDecl && "Record type has no declaration");
BaseDecl = BaseDecl->getDefinition();
assert(BaseDecl && "Base type is not incomplete, but has no definition");
CXXRecordDecl *CXXBaseDecl = cast<CXXRecordDecl>(BaseDecl);
assert(CXXBaseDecl && "Base type is not a C++ type");
// Microsoft docs say:
// "If a base-class has a code_seg attribute, derived classes must have the
// same attribute."
const auto *BaseCSA = CXXBaseDecl->getAttr<CodeSegAttr>();
const auto *DerivedCSA = Class->getAttr<CodeSegAttr>();
if ((DerivedCSA || BaseCSA) &&
(!BaseCSA || !DerivedCSA || BaseCSA->getName() != DerivedCSA->getName())) {
Diag(Class->getLocation(), diag::err_mismatched_code_seg_base);
Diag(CXXBaseDecl->getLocation(), diag::note_base_class_specified_here)
<< CXXBaseDecl;
return nullptr;
}
// A class which contains a flexible array member is not suitable for use as a
// base class:
// - If the layout determines that a base comes before another base,
// the flexible array member would index into the subsequent base.
// - If the layout determines that base comes before the derived class,
// the flexible array member would index into the derived class.
if (CXXBaseDecl->hasFlexibleArrayMember()) {
Diag(BaseLoc, diag::err_base_class_has_flexible_array_member)
<< CXXBaseDecl->getDeclName();
return nullptr;
}
// C++ [class]p3:
// If a class is marked final and it appears as a base-type-specifier in
// base-clause, the program is ill-formed.
if (FinalAttr *FA = CXXBaseDecl->getAttr<FinalAttr>()) {
Diag(BaseLoc, diag::err_class_marked_final_used_as_base)
<< CXXBaseDecl->getDeclName()
<< FA->isSpelledAsSealed();
Diag(CXXBaseDecl->getLocation(), diag::note_entity_declared_at)
<< CXXBaseDecl->getDeclName() << FA->getRange();
return nullptr;
}
if (BaseDecl->isInvalidDecl())
Class->setInvalidDecl();
// Create the base specifier.
return new (Context) CXXBaseSpecifier(SpecifierRange, Virtual,
Class->getTagKind() == TTK_Class,
Access, TInfo, EllipsisLoc);
}
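// Hypothetical base-clauses hitting the checks above:
//
//   struct A {};
//   union V : A {};                    // error: unions cannot have bases
//   union U {};
//   struct B : U {};                   // error: union used as a base class
//   struct Fwd;
//   struct C : Fwd {};                 // error: incomplete base class
//   struct F final {};
//   struct G : F {};                   // error: base is marked 'final'
//   template <class T>
//   struct Self : Self<T> {};          // error: circular inheritance
//                                      //        (dependent-base case)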
/// ActOnBaseSpecifier - Parsed a base specifier. A base specifier is
/// one entry in the base class list of a class specifier, for
/// example:
/// class foo : public bar, virtual private baz {
/// 'public bar' and 'virtual private baz' are each base-specifiers.
BaseResult
Sema::ActOnBaseSpecifier(Decl *classdecl, SourceRange SpecifierRange,
ParsedAttributes &Attributes,
bool Virtual, AccessSpecifier Access,
ParsedType basetype, SourceLocation BaseLoc,
SourceLocation EllipsisLoc) {
if (!classdecl)
return true;
AdjustDeclIfTemplate(classdecl);
CXXRecordDecl *Class = dyn_cast<CXXRecordDecl>(classdecl);
if (!Class)
return true;
// We haven't yet attached the base specifiers.
Class->setIsParsingBaseSpecifiers();
// We do not support any C++11 attributes on base-specifiers yet.
// Diagnose any attributes we see.
for (const ParsedAttr &AL : Attributes) {
if (AL.isInvalid() || AL.getKind() == ParsedAttr::IgnoredAttribute)
continue;
Diag(AL.getLoc(), AL.getKind() == ParsedAttr::UnknownAttribute
? (unsigned)diag::warn_unknown_attribute_ignored
: (unsigned)diag::err_base_specifier_attribute)
<< AL.getName();
}
TypeSourceInfo *TInfo = nullptr;
GetTypeFromParser(basetype, &TInfo);
if (EllipsisLoc.isInvalid() &&
DiagnoseUnexpandedParameterPack(SpecifierRange.getBegin(), TInfo,
UPPC_BaseType))
return true;
if (CXXBaseSpecifier *BaseSpec = CheckBaseSpecifier(Class, SpecifierRange,
Virtual, Access, TInfo,
EllipsisLoc))
return BaseSpec;
else
Class->setInvalidDecl();
return true;
}
/// Use small set to collect indirect bases. As this is only used
/// locally, there's no need to abstract the small size parameter.
typedef llvm::SmallPtrSet<QualType, 4> IndirectBaseSet;
/// Recursively add the bases of Type. Don't add Type itself.
static void
NoteIndirectBases(ASTContext &Context, IndirectBaseSet &Set,
const QualType &Type)
{
// Even though the incoming type is a base, it might not be
// a class -- it could be a template parm, for instance.
if (auto Rec = Type->getAs<RecordType>()) {
auto Decl = Rec->getAsCXXRecordDecl();
// Iterate over its bases.
for (const auto &BaseSpec : Decl->bases()) {
QualType Base = Context.getCanonicalType(BaseSpec.getType())
.getUnqualifiedType();
if (Set.insert(Base).second)
// If we've not already seen it, recurse.
NoteIndirectBases(Context, Set, Base);
}
}
}
/// Performs the actual work of attaching the given base class
/// specifiers to a C++ class.
bool Sema::AttachBaseSpecifiers(CXXRecordDecl *Class,
MutableArrayRef<CXXBaseSpecifier *> Bases) {
if (Bases.empty())
return false;
// Used to keep track of which base types we have already seen, so
// that we can properly diagnose redundant direct base types. Note
// that the key is always the unqualified canonical type of the base
// class.
std::map<QualType, CXXBaseSpecifier*, QualTypeOrdering> KnownBaseTypes;
// Used to track indirect bases so we can see if a direct base is
// ambiguous.
IndirectBaseSet IndirectBaseTypes;
// Copy non-redundant base specifiers into permanent storage.
unsigned NumGoodBases = 0;
bool Invalid = false;
for (unsigned idx = 0; idx < Bases.size(); ++idx) {
QualType NewBaseType
= Context.getCanonicalType(Bases[idx]->getType());
NewBaseType = NewBaseType.getLocalUnqualifiedType();
CXXBaseSpecifier *&KnownBase = KnownBaseTypes[NewBaseType];
if (KnownBase) {
// C++ [class.mi]p3:
// A class shall not be specified as a direct base class of a
// derived class more than once.
Diag(Bases[idx]->getBeginLoc(), diag::err_duplicate_base_class)
<< KnownBase->getType() << Bases[idx]->getSourceRange();
// Delete the duplicate base class specifier; we're going to
// overwrite its pointer later.
Context.Deallocate(Bases[idx]);
Invalid = true;
} else {
// Okay, add this new base class.
KnownBase = Bases[idx];
Bases[NumGoodBases++] = Bases[idx];
// Note this base's direct & indirect bases, if there could be ambiguity.
if (Bases.size() > 1)
NoteIndirectBases(Context, IndirectBaseTypes, NewBaseType);
if (const RecordType *Record = NewBaseType->getAs<RecordType>()) {
const CXXRecordDecl *RD = cast<CXXRecordDecl>(Record->getDecl());
if (Class->isInterface() &&
(!RD->isInterfaceLike() ||
KnownBase->getAccessSpecifier() != AS_public)) {
// The Microsoft extension __interface does not permit bases that
// are not themselves public interfaces.
Diag(KnownBase->getBeginLoc(), diag::err_invalid_base_in_interface)
<< getRecordDiagFromTagKind(RD->getTagKind()) << RD
<< RD->getSourceRange();
Invalid = true;
}
if (RD->hasAttr<WeakAttr>())
Class->addAttr(WeakAttr::CreateImplicit(Context));
}
}
}
// Attach the remaining base class specifiers to the derived class.
Class->setBases(Bases.data(), NumGoodBases);
// Check that the only base classes that are duplicate are virtual.
for (unsigned idx = 0; idx < NumGoodBases; ++idx) {
// Check whether this direct base is inaccessible due to ambiguity.
QualType BaseType = Bases[idx]->getType();
// Skip all dependent types in templates being used as base specifiers.
// Checks below assume that the base specifier is a CXXRecord.
if (BaseType->isDependentType())
continue;
CanQualType CanonicalBase = Context.getCanonicalType(BaseType)
.getUnqualifiedType();
if (IndirectBaseTypes.count(CanonicalBase)) {
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
/*DetectVirtual=*/true);
bool found
= Class->isDerivedFrom(CanonicalBase->getAsCXXRecordDecl(), Paths);
assert(found);
(void)found;
if (Paths.isAmbiguous(CanonicalBase))
Diag(Bases[idx]->getBeginLoc(), diag::warn_inaccessible_base_class)
<< BaseType << getAmbiguousPathsDisplayString(Paths)
<< Bases[idx]->getSourceRange();
else
assert(Bases[idx]->isVirtual());
}
// Delete the base class specifier, since its data has been copied
// into the CXXRecordDecl.
Context.Deallocate(Bases[idx]);
}
return Invalid;
}
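// Hypothetical hierarchies exercising the duplicate/ambiguity handling above:
//
//   struct A {};
//   struct B : A, A {};        // error: 'A' appears as a direct base twice
//
//   struct C : A {};
//   struct D : A, C {};        // direct base 'A' is also reachable through
//                              // 'C', so it is flagged as inaccessible due
//                              // to ambiguity (warning)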
/// ActOnBaseSpecifiers - Attach the given base specifiers to the
/// class, after checking whether there are any duplicate base
/// classes.
void Sema::ActOnBaseSpecifiers(Decl *ClassDecl,
MutableArrayRef<CXXBaseSpecifier *> Bases) {
if (!ClassDecl || Bases.empty())
return;
AdjustDeclIfTemplate(ClassDecl);
AttachBaseSpecifiers(cast<CXXRecordDecl>(ClassDecl), Bases);
}
/// Determine whether the type \p Derived is a C++ class that is
/// derived from the type \p Base.
bool Sema::IsDerivedFrom(SourceLocation Loc, QualType Derived, QualType Base) {
if (!getLangOpts().CPlusPlus)
return false;
CXXRecordDecl *DerivedRD = Derived->getAsCXXRecordDecl();
if (!DerivedRD)
return false;
CXXRecordDecl *BaseRD = Base->getAsCXXRecordDecl();
if (!BaseRD)
return false;
// If either the base or the derived type is invalid, don't try to
// check whether one is derived from the other.
if (BaseRD->isInvalidDecl() || DerivedRD->isInvalidDecl())
return false;
// FIXME: In a modules build, do we need the entire path to be visible for us
// to be able to use the inheritance relationship?
if (!isCompleteType(Loc, Derived) && !DerivedRD->isBeingDefined())
return false;
return DerivedRD->isDerivedFrom(BaseRD);
}
/// Determine whether the type \p Derived is a C++ class that is
/// derived from the type \p Base.
bool Sema::IsDerivedFrom(SourceLocation Loc, QualType Derived, QualType Base,
CXXBasePaths &Paths) {
if (!getLangOpts().CPlusPlus)
return false;
CXXRecordDecl *DerivedRD = Derived->getAsCXXRecordDecl();
if (!DerivedRD)
return false;
CXXRecordDecl *BaseRD = Base->getAsCXXRecordDecl();
if (!BaseRD)
return false;
if (!isCompleteType(Loc, Derived) && !DerivedRD->isBeingDefined())
return false;
return DerivedRD->isDerivedFrom(BaseRD, Paths);
}
static void BuildBasePathArray(const CXXBasePath &Path,
CXXCastPath &BasePathArray) {
// We first go backward and check if we have a virtual base.
// FIXME: It would be better if CXXBasePath had the base specifier for
// the nearest virtual base.
unsigned Start = 0;
for (unsigned I = Path.size(); I != 0; --I) {
if (Path[I - 1].Base->isVirtual()) {
Start = I - 1;
break;
}
}
// Now add all bases.
for (unsigned I = Start, E = Path.size(); I != E; ++I)
BasePathArray.push_back(const_cast<CXXBaseSpecifier*>(Path[I].Base));
}
void Sema::BuildBasePathArray(const CXXBasePaths &Paths,
CXXCastPath &BasePathArray) {
assert(BasePathArray.empty() && "Base path array must be empty!");
assert(Paths.isRecordingPaths() && "Must record paths!");
return ::BuildBasePathArray(Paths.front(), BasePathArray);
}
/// CheckDerivedToBaseConversion - Check whether the Derived-to-Base
/// conversion (where Derived and Base are class types) is
/// well-formed, meaning that the conversion is unambiguous (and
/// that all of the base classes are accessible). Returns true
/// and emits a diagnostic if the code is ill-formed, returns false
/// otherwise. Loc is the location where this routine should point to
/// if there is an error, and Range is the source range to highlight
/// if there is an error.
///
/// If either InaccessibleBaseID or AmbigiousBaseConvID are 0, then the
/// diagnostic for the respective type of error will be suppressed, but the
/// check for ill-formed code will still be performed.
bool
Sema::CheckDerivedToBaseConversion(QualType Derived, QualType Base,
unsigned InaccessibleBaseID,
unsigned AmbigiousBaseConvID,
SourceLocation Loc, SourceRange Range,
DeclarationName Name,
CXXCastPath *BasePath,
bool IgnoreAccess) {
// First, determine whether the path from Derived to Base is
// ambiguous. This is slightly more expensive than checking whether
// the Derived to Base conversion exists, because here we need to
// explore multiple paths to determine if there is an ambiguity.
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
/*DetectVirtual=*/false);
bool DerivationOkay = IsDerivedFrom(Loc, Derived, Base, Paths);
if (!DerivationOkay)
return true;
const CXXBasePath *Path = nullptr;
if (!Paths.isAmbiguous(Context.getCanonicalType(Base).getUnqualifiedType()))
Path = &Paths.front();
// For MSVC compatibility, check if Derived directly inherits from Base. Clang
// warns about this hierarchy under -Winaccessible-base, but MSVC allows the
// user to access such bases.
if (!Path && getLangOpts().MSVCCompat) {
for (const CXXBasePath &PossiblePath : Paths) {
if (PossiblePath.size() == 1) {
Path = &PossiblePath;
if (AmbigiousBaseConvID)
Diag(Loc, diag::ext_ms_ambiguous_direct_base)
<< Base << Derived << Range;
break;
}
}
}
if (Path) {
if (!IgnoreAccess) {
// Check that the base class can be accessed.
switch (
CheckBaseClassAccess(Loc, Base, Derived, *Path, InaccessibleBaseID)) {
case AR_inaccessible:
return true;
case AR_accessible:
case AR_dependent:
case AR_delayed:
break;
}
}
// Build a base path if necessary.
if (BasePath)
::BuildBasePathArray(*Path, *BasePath);
return false;
}
if (AmbigiousBaseConvID) {
// We know that the derived-to-base conversion is ambiguous, and
// we're going to produce a diagnostic. Perform the derived-to-base
// search just one more time to compute all of the possible paths so
// that we can print them out. This is more expensive than any of
// the previous derived-to-base checks we've done, but at this point
// performance isn't as much of an issue.
Paths.clear();
Paths.setRecordingPaths(true);
bool StillOkay = IsDerivedFrom(Loc, Derived, Base, Paths);
assert(StillOkay && "Can only be used with a derived-to-base conversion");
(void)StillOkay;
// Build up a textual representation of the ambiguous paths, e.g.,
// D -> B -> A, that will be used to illustrate the ambiguous
// conversions in the diagnostic. We only print one of the paths
// to each base class subobject.
std::string PathDisplayStr = getAmbiguousPathsDisplayString(Paths);
Diag(Loc, AmbigiousBaseConvID)
<< Derived << Base << PathDisplayStr << Range << Name;
}
return true;
}
bool
Sema::CheckDerivedToBaseConversion(QualType Derived, QualType Base,
SourceLocation Loc, SourceRange Range,
CXXCastPath *BasePath,
bool IgnoreAccess) {
return CheckDerivedToBaseConversion(
Derived, Base, diag::err_upcast_to_inaccessible_base,
diag::err_ambiguous_derived_to_base_conv, Loc, Range, DeclarationName(),
BasePath, IgnoreAccess);
}
/// Builds a string representing ambiguous paths from a
/// specific derived class to different subobjects of the same base
/// class.
///
/// This function builds a string that can be used in error messages
/// to show the different paths that one can take through the
/// inheritance hierarchy to go from the derived class to different
/// subobjects of a base class. The result looks something like this:
/// @code
/// struct D -> struct B -> struct A
/// struct D -> struct C -> struct A
/// @endcode
std::string Sema::getAmbiguousPathsDisplayString(CXXBasePaths &Paths) {
std::string PathDisplayStr;
std::set<unsigned> DisplayedPaths;
for (CXXBasePaths::paths_iterator Path = Paths.begin();
Path != Paths.end(); ++Path) {
if (DisplayedPaths.insert(Path->back().SubobjectNumber).second) {
// We haven't displayed a path to this particular base
// class subobject yet.
PathDisplayStr += "\n ";
PathDisplayStr += Context.getTypeDeclType(Paths.getOrigin()).getAsString();
for (CXXBasePath::const_iterator Element = Path->begin();
Element != Path->end(); ++Element)
PathDisplayStr += " -> " + Element->Base->getType().getAsString();
}
}
return PathDisplayStr;
}
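// Hypothetical hierarchy producing output like the D -> B -> A example in the
// comment above:
//
//   struct A { int a; };
//   struct B : A {};
//   struct C : A {};
//   struct D : B, C {};
//   D d;
//   // A *pa = &d;   // error: ambiguous conversion from 'D *' to 'A *';
//                    //        both paths are printed using this helper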
//===----------------------------------------------------------------------===//
// C++ class member Handling
//===----------------------------------------------------------------------===//
/// ActOnAccessSpecifier - Parsed an access specifier followed by a colon.
bool Sema::ActOnAccessSpecifier(AccessSpecifier Access, SourceLocation ASLoc,
SourceLocation ColonLoc,
const ParsedAttributesView &Attrs) {
assert(Access != AS_none && "Invalid kind for syntactic access specifier!");
AccessSpecDecl *ASDecl = AccessSpecDecl::Create(Context, Access, CurContext,
ASLoc, ColonLoc);
CurContext->addHiddenDecl(ASDecl);
return ProcessAccessDeclAttributeList(ASDecl, Attrs);
}
/// CheckOverrideControl - Check C++11 override control semantics.
void Sema::CheckOverrideControl(NamedDecl *D) {
if (D->isInvalidDecl())
return;
// We only care about "override" and "final" declarations.
if (!D->hasAttr<OverrideAttr>() && !D->hasAttr<FinalAttr>())
return;
CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(D);
// We can't check dependent instance methods.
if (MD && MD->isInstance() &&
(MD->getParent()->hasAnyDependentBases() ||
MD->getType()->isDependentType()))
return;
if (MD && !MD->isVirtual()) {
// If we have a non-virtual method, check if it hides a virtual method.
// (In that case, it's most likely the method has the wrong type.)
SmallVector<CXXMethodDecl *, 8> OverloadedMethods;
FindHiddenVirtualMethods(MD, OverloadedMethods);
if (!OverloadedMethods.empty()) {
if (OverrideAttr *OA = D->getAttr<OverrideAttr>()) {
Diag(OA->getLocation(),
diag::override_keyword_hides_virtual_member_function)
<< "override" << (OverloadedMethods.size() > 1);
} else if (FinalAttr *FA = D->getAttr<FinalAttr>()) {
Diag(FA->getLocation(),
diag::override_keyword_hides_virtual_member_function)
<< (FA->isSpelledAsSealed() ? "sealed" : "final")
<< (OverloadedMethods.size() > 1);
}
NoteHiddenVirtualMethods(MD, OverloadedMethods);
MD->setInvalidDecl();
return;
}
// Fall through into the general case diagnostic.
// FIXME: We might want to attempt typo correction here.
}
if (!MD || !MD->isVirtual()) {
if (OverrideAttr *OA = D->getAttr<OverrideAttr>()) {
Diag(OA->getLocation(),
diag::override_keyword_only_allowed_on_virtual_member_functions)
<< "override" << FixItHint::CreateRemoval(OA->getLocation());
D->dropAttr<OverrideAttr>();
}
if (FinalAttr *FA = D->getAttr<FinalAttr>()) {
Diag(FA->getLocation(),
diag::override_keyword_only_allowed_on_virtual_member_functions)
<< (FA->isSpelledAsSealed() ? "sealed" : "final")
<< FixItHint::CreateRemoval(FA->getLocation());
D->dropAttr<FinalAttr>();
}
return;
}
// C++11 [class.virtual]p5:
// If a function is marked with the virt-specifier override and
// does not override a member function of a base class, the program is
// ill-formed.
bool HasOverriddenMethods = MD->size_overridden_methods() != 0;
if (MD->hasAttr<OverrideAttr>() && !HasOverriddenMethods)
Diag(MD->getLocation(), diag::err_function_marked_override_not_overriding)
<< MD->getDeclName();
}
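// Hypothetical members exercising the override-control checks above:
//
//   struct Base { virtual void f(int); };
//   struct Derived : Base {
//     void f(long) override;       // error: marked 'override' but only hides
//                                  //        the virtual Base::f(int)
//     void g() override;           // error: 'override' on a non-virtual
//                                  //        member function
//     virtual void h() override;   // error: does not override anything
//   };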
void Sema::DiagnoseAbsenceOfOverrideControl(NamedDecl *D) {
if (D->isInvalidDecl() || D->hasAttr<OverrideAttr>())
return;
CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(D);
if (!MD || MD->isImplicit() || MD->hasAttr<FinalAttr>())
return;
SourceLocation Loc = MD->getLocation();
SourceLocation SpellingLoc = Loc;
if (getSourceManager().isMacroArgExpansion(Loc))
SpellingLoc = getSourceManager().getImmediateExpansionRange(Loc).getBegin();
SpellingLoc = getSourceManager().getSpellingLoc(SpellingLoc);
if (SpellingLoc.isValid() && getSourceManager().isInSystemHeader(SpellingLoc))
return;
if (MD->size_overridden_methods() > 0) {
unsigned DiagID = isa<CXXDestructorDecl>(MD)
? diag::warn_destructor_marked_not_override_overriding
: diag::warn_function_marked_not_override_overriding;
Diag(MD->getLocation(), DiagID) << MD->getDeclName();
const CXXMethodDecl *OMD = *MD->begin_overridden_methods();
Diag(OMD->getLocation(), diag::note_overridden_virtual_function);
}
}
/// CheckIfOverriddenFunctionIsMarkedFinal - Checks whether a virtual member
/// function overrides a virtual member function marked 'final', according to
/// C++11 [class.virtual]p4.
bool Sema::CheckIfOverriddenFunctionIsMarkedFinal(const CXXMethodDecl *New,
const CXXMethodDecl *Old) {
FinalAttr *FA = Old->getAttr<FinalAttr>();
if (!FA)
return false;
Diag(New->getLocation(), diag::err_final_function_overridden)
<< New->getDeclName()
<< FA->isSpelledAsSealed();
Diag(Old->getLocation(), diag::note_overridden_virtual_function);
return true;
}
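// Hypothetical violation of C++11 [class.virtual]p4 reported above:
//
//   struct B { virtual void f() final; };
//   struct D : B { void f(); };   // error: overrides a function marked 'final'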
static bool InitializationHasSideEffects(const FieldDecl &FD) {
const Type *T = FD.getType()->getBaseElementTypeUnsafe();
// FIXME: Destruction of ObjC lifetime types has side-effects.
if (const CXXRecordDecl *RD = T->getAsCXXRecordDecl())
return !RD->isCompleteDefinition() ||
!RD->hasTrivialDefaultConstructor() ||
!RD->hasTrivialDestructor();
return false;
}
static const ParsedAttr *getMSPropertyAttr(const ParsedAttributesView &list) {
ParsedAttributesView::const_iterator Itr =
llvm::find_if(list, [](const ParsedAttr &AL) {
return AL.isDeclspecPropertyAttribute();
});
if (Itr != list.end())
return &*Itr;
return nullptr;
}
// Check if there is a field shadowing.
void Sema::CheckShadowInheritedFields(const SourceLocation &Loc,
DeclarationName FieldName,
const CXXRecordDecl *RD,
bool DeclIsField) {
if (Diags.isIgnored(diag::warn_shadow_field, Loc))
return;
// Map each base class to the shadowed field found in it.
std::map<CXXRecordDecl*, NamedDecl*> Bases;
auto FieldShadowed = [&](const CXXBaseSpecifier *Specifier,
CXXBasePath &Path) {
const auto Base = Specifier->getType()->getAsCXXRecordDecl();
// Record an ambiguous path directly
if (Bases.find(Base) != Bases.end())
return true;
for (const auto Field : Base->lookup(FieldName)) {
if ((isa<FieldDecl>(Field) || isa<IndirectFieldDecl>(Field)) &&
Field->getAccess() != AS_private) {
assert(Field->getAccess() != AS_none);
assert(Bases.find(Base) == Bases.end());
Bases[Base] = Field;
return true;
}
}
return false;
};
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
/*DetectVirtual=*/true);
if (!RD->lookupInBases(FieldShadowed, Paths))
return;
for (const auto &P : Paths) {
auto Base = P.back().Base->getType()->getAsCXXRecordDecl();
auto It = Bases.find(Base);
// Skip duplicated bases
if (It == Bases.end())
continue;
auto BaseField = It->second;
assert(BaseField->getAccess() != AS_private);
if (AS_none !=
CXXRecordDecl::MergeAccess(P.Access, BaseField->getAccess())) {
Diag(Loc, diag::warn_shadow_field)
<< FieldName << RD << Base << DeclIsField;
Diag(BaseField->getLocation(), diag::note_shadow_field);
Bases.erase(It);
}
}
}
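// Hypothetical shadowing reported by the base walk above (grouped under
// -Wshadow-field):
//
//   struct Base { int value; };
//   struct Derived : Base {
//     int value;   // warning: 'value' shadows the inherited field
//   };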
/// ActOnCXXMemberDeclarator - This is invoked when a C++ class member
/// declarator is parsed. 'AS' is the access specifier, 'BW' specifies the
/// bitfield width if there is one, 'InitExpr' specifies the initializer if
/// one has been parsed, and 'InitStyle' is set if an in-class initializer is
/// present (but parsing it has been deferred).
NamedDecl *
Sema::ActOnCXXMemberDeclarator(Scope *S, AccessSpecifier AS, Declarator &D,
MultiTemplateParamsArg TemplateParameterLists,
Expr *BW, const VirtSpecifiers &VS,
InClassInitStyle InitStyle) {
const DeclSpec &DS = D.getDeclSpec();
DeclarationNameInfo NameInfo = GetNameForDeclarator(D);
DeclarationName Name = NameInfo.getName();
SourceLocation Loc = NameInfo.getLoc();
// For anonymous bitfields, the location should point to the type.
if (Loc.isInvalid())
Loc = D.getBeginLoc();
Expr *BitWidth = static_cast<Expr*>(BW);
assert(isa<CXXRecordDecl>(CurContext));
assert(!DS.isFriendSpecified());
bool isFunc = D.isDeclarationOfFunction();
const ParsedAttr *MSPropertyAttr =
getMSPropertyAttr(D.getDeclSpec().getAttributes());
if (cast<CXXRecordDecl>(CurContext)->isInterface()) {
// The Microsoft extension __interface only permits public member functions
// and prohibits constructors, destructors, operators, non-public member
// functions, static methods and data members.
unsigned InvalidDecl;
bool ShowDeclName = true;
if (!isFunc &&
(DS.getStorageClassSpec() == DeclSpec::SCS_typedef || MSPropertyAttr))
InvalidDecl = 0;
else if (!isFunc)
InvalidDecl = 1;
else if (AS != AS_public)
InvalidDecl = 2;
else if (DS.getStorageClassSpec() == DeclSpec::SCS_static)
InvalidDecl = 3;
else switch (Name.getNameKind()) {
case DeclarationName::CXXConstructorName:
InvalidDecl = 4;
ShowDeclName = false;
break;
case DeclarationName::CXXDestructorName:
InvalidDecl = 5;
ShowDeclName = false;
break;
case DeclarationName::CXXOperatorName:
case DeclarationName::CXXConversionFunctionName:
InvalidDecl = 6;
break;
default:
InvalidDecl = 0;
break;
}
if (InvalidDecl) {
if (ShowDeclName)
Diag(Loc, diag::err_invalid_member_in_interface)
<< (InvalidDecl-1) << Name;
else
Diag(Loc, diag::err_invalid_member_in_interface)
<< (InvalidDecl-1) << "";
return nullptr;
}
}
// C++ 9.2p6: A member shall not be declared to have automatic storage
// duration (auto, register) or with the extern storage-class-specifier.
// C++ 7.1.1p8: The mutable specifier can be applied only to names of class
// data members and cannot be applied to names declared const or static,
// and cannot be applied to reference members.
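// For instance, declarations of roughly this form are diagnosed by the
// switch below:
//
//   struct S {
//     extern int a;        // invalid storage class for a member
//     mutable void f();    // 'mutable' cannot apply to a member function
//   };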
switch (DS.getStorageClassSpec()) {
case DeclSpec::SCS_unspecified:
case DeclSpec::SCS_typedef:
case DeclSpec::SCS_static:
break;
case DeclSpec::SCS_mutable:
if (isFunc) {
Diag(DS.getStorageClassSpecLoc(), diag::err_mutable_function);
// FIXME: It would be nicer if the keyword was ignored only for this
// declarator. Otherwise we could get follow-up errors.
D.getMutableDeclSpec().ClearStorageClassSpecs();
}
break;
default:
Diag(DS.getStorageClassSpecLoc(),
diag::err_storageclass_invalid_for_member);
D.getMutableDeclSpec().ClearStorageClassSpecs();
break;
}
bool isInstField = ((DS.getStorageClassSpec() == DeclSpec::SCS_unspecified ||
DS.getStorageClassSpec() == DeclSpec::SCS_mutable) &&
!isFunc);
if (DS.hasConstexprSpecifier() && isInstField) {
SemaDiagnosticBuilder B =
Diag(DS.getConstexprSpecLoc(), diag::err_invalid_constexpr_member);
SourceLocation ConstexprLoc = DS.getConstexprSpecLoc();
if (InitStyle == ICIS_NoInit) {
B << 0 << 0;
if (D.getDeclSpec().getTypeQualifiers() & DeclSpec::TQ_const)
B << FixItHint::CreateRemoval(ConstexprLoc);
else {
B << FixItHint::CreateReplacement(ConstexprLoc, "const");
D.getMutableDeclSpec().ClearConstexprSpec();
const char *PrevSpec;
unsigned DiagID;
bool Failed = D.getMutableDeclSpec().SetTypeQual(
DeclSpec::TQ_const, ConstexprLoc, PrevSpec, DiagID, getLangOpts());
(void)Failed;
assert(!Failed && "Making a constexpr member const shouldn't fail");
}
} else {
B << 1;
const char *PrevSpec;
unsigned DiagID;
if (D.getMutableDeclSpec().SetStorageClassSpec(
*this, DeclSpec::SCS_static, ConstexprLoc, PrevSpec, DiagID,
Context.getPrintingPolicy())) {
assert(DS.getStorageClassSpec() == DeclSpec::SCS_mutable &&
"This is the only DeclSpec that should fail to be applied");
B << 1;
} else {
B << 0 << FixItHint::CreateInsertion(ConstexprLoc, "static ");
isInstField = false;
}
}
}
NamedDecl *Member;
if (isInstField) {
CXXScopeSpec &SS = D.getCXXScopeSpec();
// Data members must have identifiers for names.
if (!Name.isIdentifier()) {
Diag(Loc, diag::err_bad_variable_name)
<< Name;
return nullptr;
}
IdentifierInfo *II = Name.getAsIdentifierInfo();
// A data member cannot be declared with the 'template' keyword, so
// TemplateParameterLists should be empty in this case.
if (TemplateParameterLists.size()) {
TemplateParameterList* TemplateParams = TemplateParameterLists[0];
if (TemplateParams->size()) {
// There is no such thing as a member field template.
Diag(D.getIdentifierLoc(), diag::err_template_member)
<< II
<< SourceRange(TemplateParams->getTemplateLoc(),
TemplateParams->getRAngleLoc());
} else {
// There is an extraneous 'template<>' for this member.
Diag(TemplateParams->getTemplateLoc(),
diag::err_template_member_noparams)
<< II
<< SourceRange(TemplateParams->getTemplateLoc(),
TemplateParams->getRAngleLoc());
}
return nullptr;
}
if (SS.isSet() && !SS.isInvalid()) {
// The user provided a superfluous scope specifier inside a class
// definition:
//
// class X {
// int X::member;
// };
if (DeclContext *DC = computeDeclContext(SS, false))
diagnoseQualifiedDeclaration(SS, DC, Name, D.getIdentifierLoc(),
D.getName().getKind() ==
UnqualifiedIdKind::IK_TemplateId);
else
Diag(D.getIdentifierLoc(), diag::err_member_qualification)
<< Name << SS.getRange();
SS.clear();
}
if (MSPropertyAttr) {
Member = HandleMSProperty(S, cast<CXXRecordDecl>(CurContext), Loc, D,
BitWidth, InitStyle, AS, *MSPropertyAttr);
if (!Member)
return nullptr;
isInstField = false;
} else {
Member = HandleField(S, cast<CXXRecordDecl>(CurContext), Loc, D,
BitWidth, InitStyle, AS);
if (!Member)
return nullptr;
}
CheckShadowInheritedFields(Loc, Name, cast<CXXRecordDecl>(CurContext));
} else {
Member = HandleDeclarator(S, D, TemplateParameterLists);
if (!Member)
return nullptr;
// Members that are not instance fields cannot have a bit-field width.
if (BitWidth) {
if (Member->isInvalidDecl()) {
// don't emit another diagnostic.
} else if (isa<VarDecl>(Member) || isa<VarTemplateDecl>(Member)) {
// C++ 9.6p3: A bit-field shall not be a static member.
// "static member 'A' cannot be a bit-field"
Diag(Loc, diag::err_static_not_bitfield)
<< Name << BitWidth->getSourceRange();
} else if (isa<TypedefDecl>(Member)) {
// "typedef member 'x' cannot be a bit-field"
Diag(Loc, diag::err_typedef_not_bitfield)
<< Name << BitWidth->getSourceRange();
} else {
// A function typedef ("typedef int f(); f a;").
// C++ 9.6p3: A bit-field shall have integral or enumeration type.
Diag(Loc, diag::err_not_integral_type_bitfield)
<< Name << cast<ValueDecl>(Member)->getType()
<< BitWidth->getSourceRange();
}
BitWidth = nullptr;
Member->setInvalidDecl();
}
NamedDecl *NonTemplateMember = Member;
if (FunctionTemplateDecl *FunTmpl = dyn_cast<FunctionTemplateDecl>(Member))
NonTemplateMember = FunTmpl->getTemplatedDecl();
else if (VarTemplateDecl *VarTmpl = dyn_cast<VarTemplateDecl>(Member))
NonTemplateMember = VarTmpl->getTemplatedDecl();
Member->setAccess(AS);
// If we have declared a member function template or static data member
// template, set the access of the templated declaration as well.
if (NonTemplateMember != Member)
NonTemplateMember->setAccess(AS);
// C++ [temp.deduct.guide]p3:
// A deduction guide [...] for a member class template [shall be
// declared] with the same access [as the template].
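// For example, a mismatch roughly like the following is diagnosed below
// (names are illustrative only):
//
//   struct S {
//     template <typename T> struct P { P(T); };
//   private:
//     P(int) -> P<int>;    // access differs from the member template 'P'
//   };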
if (auto *DG = dyn_cast<CXXDeductionGuideDecl>(NonTemplateMember)) {
auto *TD = DG->getDeducedTemplate();
// Access specifiers are only meaningful if both the template and the
// deduction guide are from the same scope.
if (AS != TD->getAccess() &&
TD->getDeclContext()->getRedeclContext()->Equals(
DG->getDeclContext()->getRedeclContext())) {
Diag(DG->getBeginLoc(), diag::err_deduction_guide_wrong_access);
Diag(TD->getBeginLoc(), diag::note_deduction_guide_template_access)
<< TD->getAccess();
const AccessSpecDecl *LastAccessSpec = nullptr;
for (const auto *D : cast<CXXRecordDecl>(CurContext)->decls()) {
if (const auto *AccessSpec = dyn_cast<AccessSpecDecl>(D))
LastAccessSpec = AccessSpec;
}
assert(LastAccessSpec && "differing access with no access specifier");
Diag(LastAccessSpec->getBeginLoc(), diag::note_deduction_guide_access)
<< AS;
}
}
}
if (VS.isOverrideSpecified())
Member->addAttr(new (Context) OverrideAttr(VS.getOverrideLoc(), Context, 0));
if (VS.isFinalSpecified())
Member->addAttr(new (Context) FinalAttr(VS.getFinalLoc(), Context,
VS.isFinalSpelledSealed()));
if (VS.getLastLocation().isValid()) {
// Update the end location of a method that has virt-specifiers.
if (CXXMethodDecl *MD = dyn_cast_or_null<CXXMethodDecl>(Member))
MD->setRangeEnd(VS.getLastLocation());
}
CheckOverrideControl(Member);
assert((Name || isInstField) && "No identifier for non-field ?");
if (isInstField) {
FieldDecl *FD = cast<FieldDecl>(Member);
FieldCollector->Add(FD);
if (!Diags.isIgnored(diag::warn_unused_private_field, FD->getLocation())) {
// Remember all explicit private FieldDecls that have a name, whose
// initialization has no side effects, and that are not part of a dependent
// type declaration.
if (!FD->isImplicit() && FD->getDeclName() &&
FD->getAccess() == AS_private &&
!FD->hasAttr<UnusedAttr>() &&
!FD->getParent()->isDependentContext() &&
!InitializationHasSideEffects(*FD))
UnusedPrivateFields.insert(FD);
}
}
return Member;
}
namespace {
class UninitializedFieldVisitor
: public EvaluatedExprVisitor<UninitializedFieldVisitor> {
Sema &S;
// List of Decls to generate a warning on. Decls are removed from this set
// once they become initialized.
llvm::SmallPtrSetImpl<ValueDecl*> &Decls;
// List of base classes of the record. Classes are removed once their
// initializers have been processed.
llvm::SmallPtrSetImpl<QualType> &BaseClasses;
// Vector of decls to be removed from the Decl set prior to visiting the
// nodes. These Decls may have been initialized in the prior initializer.
llvm::SmallVector<ValueDecl*, 4> DeclsToRemove;
// If non-null, add a note to the warning pointing back to the constructor.
const CXXConstructorDecl *Constructor;
// Variables to hold state when processing an initializer list. When
// InitList is true, special case initialization of FieldDecls matching
// InitListFieldDecl.
bool InitList;
FieldDecl *InitListFieldDecl;
llvm::SmallVector<unsigned, 4> InitFieldIndex;
public:
typedef EvaluatedExprVisitor<UninitializedFieldVisitor> Inherited;
UninitializedFieldVisitor(Sema &S,
llvm::SmallPtrSetImpl<ValueDecl*> &Decls,
llvm::SmallPtrSetImpl<QualType> &BaseClasses)
: Inherited(S.Context), S(S), Decls(Decls), BaseClasses(BaseClasses),
Constructor(nullptr), InitList(false), InitListFieldDecl(nullptr) {}
// Returns true if the use of ME is not an uninitialized use.
bool IsInitListMemberExprInitialized(MemberExpr *ME,
bool CheckReferenceOnly) {
llvm::SmallVector<FieldDecl*, 4> Fields;
bool ReferenceField = false;
while (ME) {
FieldDecl *FD = dyn_cast<FieldDecl>(ME->getMemberDecl());
if (!FD)
return false;
Fields.push_back(FD);
if (FD->getType()->isReferenceType())
ReferenceField = true;
ME = dyn_cast<MemberExpr>(ME->getBase()->IgnoreParenImpCasts());
}
// Binding a reference to an uninitialized field is not an
// uninitialized use.
if (CheckReferenceOnly && !ReferenceField)
return true;
llvm::SmallVector<unsigned, 4> UsedFieldIndex;
// Discard the first field since it is the field decl that is being
// initialized.
for (auto I = Fields.rbegin() + 1, E = Fields.rend(); I != E; ++I) {
UsedFieldIndex.push_back((*I)->getFieldIndex());
}
for (auto UsedIter = UsedFieldIndex.begin(),
UsedEnd = UsedFieldIndex.end(),
OrigIter = InitFieldIndex.begin(),
OrigEnd = InitFieldIndex.end();
UsedIter != UsedEnd && OrigIter != OrigEnd; ++UsedIter, ++OrigIter) {
if (*UsedIter < *OrigIter)
return true;
if (*UsedIter > *OrigIter)
break;
}
return false;
}
void HandleMemberExpr(MemberExpr *ME, bool CheckReferenceOnly,
bool AddressOf) {
if (isa<EnumConstantDecl>(ME->getMemberDecl()))
return;
// FieldME is the inner-most MemberExpr that is not an anonymous struct
// or union.
MemberExpr *FieldME = ME;
bool AllPODFields = FieldME->getType().isPODType(S.Context);
Expr *Base = ME;
while (MemberExpr *SubME =
dyn_cast<MemberExpr>(Base->IgnoreParenImpCasts())) {
if (isa<VarDecl>(SubME->getMemberDecl()))
return;
if (FieldDecl *FD = dyn_cast<FieldDecl>(SubME->getMemberDecl()))
if (!FD->isAnonymousStructOrUnion())
FieldME = SubME;
if (!FieldME->getType().isPODType(S.Context))
AllPODFields = false;
Base = SubME->getBase();
}
if (!isa<CXXThisExpr>(Base->IgnoreParenImpCasts()))
return;
if (AddressOf && AllPODFields)
return;
ValueDecl* FoundVD = FieldME->getMemberDecl();
if (ImplicitCastExpr *BaseCast = dyn_cast<ImplicitCastExpr>(Base)) {
while (isa<ImplicitCastExpr>(BaseCast->getSubExpr())) {
BaseCast = cast<ImplicitCastExpr>(BaseCast->getSubExpr());
}
if (BaseCast->getCastKind() == CK_UncheckedDerivedToBase) {
QualType T = BaseCast->getType();
if (T->isPointerType() &&
BaseClasses.count(T->getPointeeType())) {
S.Diag(FieldME->getExprLoc(), diag::warn_base_class_is_uninit)
<< T->getPointeeType() << FoundVD;
}
}
}
if (!Decls.count(FoundVD))
return;
const bool IsReference = FoundVD->getType()->isReferenceType();
if (InitList && !AddressOf && FoundVD == InitListFieldDecl) {
// Special checking for initializer lists.
if (IsInitListMemberExprInitialized(ME, CheckReferenceOnly)) {
return;
}
} else {
// Prevent double warnings on use of unbounded references.
if (CheckReferenceOnly && !IsReference)
return;
}
unsigned diag = IsReference
? diag::warn_reference_field_is_uninit
: diag::warn_field_is_uninit;
S.Diag(FieldME->getExprLoc(), diag) << FoundVD;
if (Constructor)
S.Diag(Constructor->getLocation(),
diag::note_uninit_in_this_constructor)
<< (Constructor->isDefaultConstructor() && Constructor->isImplicit());
}
void HandleValue(Expr *E, bool AddressOf) {
E = E->IgnoreParens();
if (MemberExpr *ME = dyn_cast<MemberExpr>(E)) {
HandleMemberExpr(ME, false /*CheckReferenceOnly*/,
AddressOf /*AddressOf*/);
return;
}
if (ConditionalOperator *CO = dyn_cast<ConditionalOperator>(E)) {
Visit(CO->getCond());
HandleValue(CO->getTrueExpr(), AddressOf);
HandleValue(CO->getFalseExpr(), AddressOf);
return;
}
if (BinaryConditionalOperator *BCO =
dyn_cast<BinaryConditionalOperator>(E)) {
Visit(BCO->getCond());
HandleValue(BCO->getFalseExpr(), AddressOf);
return;
}
if (OpaqueValueExpr *OVE = dyn_cast<OpaqueValueExpr>(E)) {
HandleValue(OVE->getSourceExpr(), AddressOf);
return;
}
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(E)) {
switch (BO->getOpcode()) {
default:
break;
case(BO_PtrMemD):
case(BO_PtrMemI):
HandleValue(BO->getLHS(), AddressOf);
Visit(BO->getRHS());
return;
case(BO_Comma):
Visit(BO->getLHS());
HandleValue(BO->getRHS(), AddressOf);
return;
}
}
Visit(E);
}
void CheckInitListExpr(InitListExpr *ILE) {
InitFieldIndex.push_back(0);
for (auto Child : ILE->children()) {
if (InitListExpr *SubList = dyn_cast<InitListExpr>(Child)) {
CheckInitListExpr(SubList);
} else {
Visit(Child);
}
++InitFieldIndex.back();
}
InitFieldIndex.pop_back();
}
void CheckInitializer(Expr *E, const CXXConstructorDecl *FieldConstructor,
FieldDecl *Field, const Type *BaseClass) {
// Remove Decls that may have been initialized in the previous
// initializer.
for (ValueDecl* VD : DeclsToRemove)
Decls.erase(VD);
DeclsToRemove.clear();
Constructor = FieldConstructor;
InitListExpr *ILE = dyn_cast<InitListExpr>(E);
if (ILE && Field) {
InitList = true;
InitListFieldDecl = Field;
InitFieldIndex.clear();
CheckInitListExpr(ILE);
} else {
InitList = false;
Visit(E);
}
if (Field)
Decls.erase(Field);
if (BaseClass)
BaseClasses.erase(BaseClass->getCanonicalTypeInternal());
}
void VisitMemberExpr(MemberExpr *ME) {
// All uses of unbounded reference fields will warn.
HandleMemberExpr(ME, true /*CheckReferenceOnly*/, false /*AddressOf*/);
}
void VisitImplicitCastExpr(ImplicitCastExpr *E) {
if (E->getCastKind() == CK_LValueToRValue) {
HandleValue(E->getSubExpr(), false /*AddressOf*/);
return;
}
Inherited::VisitImplicitCastExpr(E);
}
void VisitCXXConstructExpr(CXXConstructExpr *E) {
if (E->getConstructor()->isCopyConstructor()) {
Expr *ArgExpr = E->getArg(0);
if (InitListExpr *ILE = dyn_cast<InitListExpr>(ArgExpr))
if (ILE->getNumInits() == 1)
ArgExpr = ILE->getInit(0);
if (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(ArgExpr))
if (ICE->getCastKind() == CK_NoOp)
ArgExpr = ICE->getSubExpr();
HandleValue(ArgExpr, false /*AddressOf*/);
return;
}
Inherited::VisitCXXConstructExpr(E);
}
void VisitCXXMemberCallExpr(CXXMemberCallExpr *E) {
Expr *Callee = E->getCallee();
if (isa<MemberExpr>(Callee)) {
HandleValue(Callee, false /*AddressOf*/);
for (auto Arg : E->arguments())
Visit(Arg);
return;
}
Inherited::VisitCXXMemberCallExpr(E);
}
void VisitCallExpr(CallExpr *E) {
// Treat std::move as a use.
if (E->isCallToStdMove()) {
HandleValue(E->getArg(0), /*AddressOf=*/false);
return;
}
Inherited::VisitCallExpr(E);
}
void VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) {
Expr *Callee = E->getCallee();
if (isa<UnresolvedLookupExpr>(Callee))
return Inherited::VisitCXXOperatorCallExpr(E);
Visit(Callee);
for (auto Arg : E->arguments())
HandleValue(Arg->IgnoreParenImpCasts(), false /*AddressOf*/);
}
void VisitBinaryOperator(BinaryOperator *E) {
// If a field assignment is detected, remove the field from the
// uninitialized field set.
if (E->getOpcode() == BO_Assign)
if (MemberExpr *ME = dyn_cast<MemberExpr>(E->getLHS()))
if (FieldDecl *FD = dyn_cast<FieldDecl>(ME->getMemberDecl()))
if (!FD->getType()->isReferenceType())
DeclsToRemove.push_back(FD);
if (E->isCompoundAssignmentOp()) {
HandleValue(E->getLHS(), false /*AddressOf*/);
Visit(E->getRHS());
return;
}
Inherited::VisitBinaryOperator(E);
}
void VisitUnaryOperator(UnaryOperator *E) {
if (E->isIncrementDecrementOp()) {
HandleValue(E->getSubExpr(), false /*AddressOf*/);
return;
}
if (E->getOpcode() == UO_AddrOf) {
if (MemberExpr *ME = dyn_cast<MemberExpr>(E->getSubExpr())) {
HandleValue(ME->getBase(), true /*AddressOf*/);
return;
}
}
Inherited::VisitUnaryOperator(E);
}
};
// Diagnose value-uses of fields to initialize themselves, e.g.
// foo(foo)
// where foo is not also a parameter to the constructor.
// Also diagnose across field uninitialized use such as
// x(y), y(x)
// TODO: implement -Wuninitialized and fold this into that framework.
static void DiagnoseUninitializedFields(
Sema &SemaRef, const CXXConstructorDecl *Constructor) {
if (SemaRef.getDiagnostics().isIgnored(diag::warn_field_is_uninit,
Constructor->getLocation())) {
return;
}
if (Constructor->isInvalidDecl())
return;
const CXXRecordDecl *RD = Constructor->getParent();
if (RD->getDescribedClassTemplate())
return;
// Holds fields that are uninitialized.
llvm::SmallPtrSet<ValueDecl*, 4> UninitializedFields;
// At the beginning, all fields are uninitialized.
for (auto *I : RD->decls()) {
if (auto *FD = dyn_cast<FieldDecl>(I)) {
UninitializedFields.insert(FD);
} else if (auto *IFD = dyn_cast<IndirectFieldDecl>(I)) {
UninitializedFields.insert(IFD->getAnonField());
}
}
llvm::SmallPtrSet<QualType, 4> UninitializedBaseClasses;
for (auto I : RD->bases())
UninitializedBaseClasses.insert(I.getType().getCanonicalType());
if (UninitializedFields.empty() && UninitializedBaseClasses.empty())
return;
UninitializedFieldVisitor UninitializedChecker(SemaRef,
UninitializedFields,
UninitializedBaseClasses);
for (const auto *FieldInit : Constructor->inits()) {
if (UninitializedFields.empty() && UninitializedBaseClasses.empty())
break;
Expr *InitExpr = FieldInit->getInit();
if (!InitExpr)
continue;
if (CXXDefaultInitExpr *Default =
dyn_cast<CXXDefaultInitExpr>(InitExpr)) {
InitExpr = Default->getExpr();
if (!InitExpr)
continue;
// In-class initializers will point to the constructor.
UninitializedChecker.CheckInitializer(InitExpr, Constructor,
FieldInit->getAnyMember(),
FieldInit->getBaseClass());
} else {
UninitializedChecker.CheckInitializer(InitExpr, nullptr,
FieldInit->getAnyMember(),
FieldInit->getBaseClass());
}
}
}
} // namespace
/// Enter a new C++ default initializer scope. After calling this, the
/// caller must call \ref ActOnFinishCXXInClassMemberInitializer, even if
/// parsing or instantiating the initializer failed.
void Sema::ActOnStartCXXInClassMemberInitializer() {
// Create a synthetic function scope to represent the call to the constructor
// that notionally surrounds a use of this initializer.
PushFunctionScope();
}
/// This is invoked after parsing an in-class initializer for a
/// non-static C++ class member, and after instantiating an in-class initializer
/// in a class template. Such actions are deferred until the class is complete.
void Sema::ActOnFinishCXXInClassMemberInitializer(Decl *D,
SourceLocation InitLoc,
Expr *InitExpr) {
// Pop the notional constructor scope we created earlier.
PopFunctionScopeInfo(nullptr, D);
FieldDecl *FD = dyn_cast<FieldDecl>(D);
assert((isa<MSPropertyDecl>(D) || FD->getInClassInitStyle() != ICIS_NoInit) &&
"must set init style when field is created");
if (!InitExpr) {
D->setInvalidDecl();
if (FD)
FD->removeInClassInitializer();
return;
}
if (DiagnoseUnexpandedParameterPack(InitExpr, UPPC_Initializer)) {
FD->setInvalidDecl();
FD->removeInClassInitializer();
return;
}
ExprResult Init = InitExpr;
if (!FD->getType()->isDependentType() && !InitExpr->isTypeDependent()) {
InitializedEntity Entity =
InitializedEntity::InitializeMemberFromDefaultMemberInitializer(FD);
InitializationKind Kind =
FD->getInClassInitStyle() == ICIS_ListInit
? InitializationKind::CreateDirectList(InitExpr->getBeginLoc(),
InitExpr->getBeginLoc(),
InitExpr->getEndLoc())
: InitializationKind::CreateCopy(InitExpr->getBeginLoc(), InitLoc);
InitializationSequence Seq(*this, Entity, Kind, InitExpr);
Init = Seq.Perform(*this, Entity, Kind, InitExpr);
if (Init.isInvalid()) {
FD->setInvalidDecl();
return;
}
}
// C++11 [class.base.init]p7:
// The initialization of each base and member constitutes a
// full-expression.
Init = ActOnFinishFullExpr(Init.get(), InitLoc, /*DiscardedValue*/ false);
if (Init.isInvalid()) {
FD->setInvalidDecl();
return;
}
InitExpr = Init.get();
FD->setInClassInitializer(InitExpr);
}
/// Find the direct and/or virtual base specifiers that
/// correspond to the given base type, for use in base initialization
/// within a constructor.
static bool FindBaseInitializer(Sema &SemaRef,
CXXRecordDecl *ClassDecl,
QualType BaseType,
const CXXBaseSpecifier *&DirectBaseSpec,
const CXXBaseSpecifier *&VirtualBaseSpec) {
// First, check for a direct base class.
DirectBaseSpec = nullptr;
for (const auto &Base : ClassDecl->bases()) {
if (SemaRef.Context.hasSameUnqualifiedType(BaseType, Base.getType())) {
// We found a direct base of this type. That's what we're
// initializing.
DirectBaseSpec = &Base;
break;
}
}
// Check for a virtual base class.
// FIXME: We might be able to short-circuit this if we know in advance that
// there are no virtual bases.
VirtualBaseSpec = nullptr;
if (!DirectBaseSpec || !DirectBaseSpec->isVirtual()) {
// We haven't found a base yet; search the class hierarchy for a
// virtual base class.
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
/*DetectVirtual=*/false);
if (SemaRef.IsDerivedFrom(ClassDecl->getLocation(),
SemaRef.Context.getTypeDeclType(ClassDecl),
BaseType, Paths)) {
for (CXXBasePaths::paths_iterator Path = Paths.begin();
Path != Paths.end(); ++Path) {
if (Path->back().Base->isVirtual()) {
VirtualBaseSpec = Path->back().Base;
break;
}
}
}
}
return DirectBaseSpec || VirtualBaseSpec;
}
/// Handle a C++ member initializer using braced-init-list syntax.
MemInitResult
Sema::ActOnMemInitializer(Decl *ConstructorD,
Scope *S,
CXXScopeSpec &SS,
IdentifierInfo *MemberOrBase,
ParsedType TemplateTypeTy,
const DeclSpec &DS,
SourceLocation IdLoc,
Expr *InitList,
SourceLocation EllipsisLoc) {
return BuildMemInitializer(ConstructorD, S, SS, MemberOrBase, TemplateTypeTy,
DS, IdLoc, InitList,
EllipsisLoc);
}
/// Handle a C++ member initializer using parentheses syntax.
MemInitResult
Sema::ActOnMemInitializer(Decl *ConstructorD,
Scope *S,
CXXScopeSpec &SS,
IdentifierInfo *MemberOrBase,
ParsedType TemplateTypeTy,
const DeclSpec &DS,
SourceLocation IdLoc,
SourceLocation LParenLoc,
ArrayRef<Expr *> Args,
SourceLocation RParenLoc,
SourceLocation EllipsisLoc) {
Expr *List = ParenListExpr::Create(Context, LParenLoc, Args, RParenLoc);
return BuildMemInitializer(ConstructorD, S, SS, MemberOrBase, TemplateTypeTy,
DS, IdLoc, List, EllipsisLoc);
}
namespace {
// Callback to only accept typo corrections that can be a valid C++ member
// initializer: either a non-static data member or a base class.
class MemInitializerValidatorCCC final : public CorrectionCandidateCallback {
public:
explicit MemInitializerValidatorCCC(CXXRecordDecl *ClassDecl)
: ClassDecl(ClassDecl) {}
bool ValidateCandidate(const TypoCorrection &candidate) override {
if (NamedDecl *ND = candidate.getCorrectionDecl()) {
if (FieldDecl *Member = dyn_cast<FieldDecl>(ND))
return Member->getDeclContext()->getRedeclContext()->Equals(ClassDecl);
return isa<TypeDecl>(ND);
}
return false;
}
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return llvm::make_unique<MemInitializerValidatorCCC>(*this);
}
private:
CXXRecordDecl *ClassDecl;
};
}
ValueDecl *Sema::tryLookupCtorInitMemberDecl(CXXRecordDecl *ClassDecl,
CXXScopeSpec &SS,
ParsedType TemplateTypeTy,
IdentifierInfo *MemberOrBase) {
if (SS.getScopeRep() || TemplateTypeTy)
return nullptr;
DeclContext::lookup_result Result = ClassDecl->lookup(MemberOrBase);
if (Result.empty())
return nullptr;
ValueDecl *Member;
if ((Member = dyn_cast<FieldDecl>(Result.front())) ||
(Member = dyn_cast<IndirectFieldDecl>(Result.front())))
return Member;
return nullptr;
}
/// Handle a C++ member initializer.
MemInitResult
Sema::BuildMemInitializer(Decl *ConstructorD,
Scope *S,
CXXScopeSpec &SS,
IdentifierInfo *MemberOrBase,
ParsedType TemplateTypeTy,
const DeclSpec &DS,
SourceLocation IdLoc,
Expr *Init,
SourceLocation EllipsisLoc) {
ExprResult Res = CorrectDelayedTyposInExpr(Init);
if (!Res.isUsable())
return true;
Init = Res.get();
if (!ConstructorD)
return true;
AdjustDeclIfTemplate(ConstructorD);
CXXConstructorDecl *Constructor
= dyn_cast<CXXConstructorDecl>(ConstructorD);
if (!Constructor) {
// The user wrote a constructor initializer on a function that is
// not a C++ constructor. Ignore the error for now, because we may
// have more member initializers coming; we'll diagnose it just
// once in ActOnMemInitializers.
return true;
}
CXXRecordDecl *ClassDecl = Constructor->getParent();
// C++ [class.base.init]p2:
// Names in a mem-initializer-id are looked up in the scope of the
// constructor's class and, if not found in that scope, are looked
// up in the scope containing the constructor's definition.
// [Note: if the constructor's class contains a member with the
// same name as a direct or virtual base class of the class, a
// mem-initializer-id naming the member or base class and composed
// of a single identifier refers to the class member. A
// mem-initializer-id for the hidden base class may be specified
// using a qualified name. ]
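// For example, in code roughly like this the single identifier 'A' in the
// mem-initializer-id names the member, not the base:
//
//   struct A {};
//   struct B : A {
//     int A;
//     B() : A(0) {}   // initializes the member 'A'
//   };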
// Look for a member, first.
if (ValueDecl *Member = tryLookupCtorInitMemberDecl(
ClassDecl, SS, TemplateTypeTy, MemberOrBase)) {
if (EllipsisLoc.isValid())
Diag(EllipsisLoc, diag::err_pack_expansion_member_init)
<< MemberOrBase
<< SourceRange(IdLoc, Init->getSourceRange().getEnd());
return BuildMemberInitializer(Member, Init, IdLoc);
}
// It didn't name a member, so see if it names a class.
QualType BaseType;
TypeSourceInfo *TInfo = nullptr;
if (TemplateTypeTy) {
BaseType = GetTypeFromParser(TemplateTypeTy, &TInfo);
if (BaseType.isNull())
return true;
} else if (DS.getTypeSpecType() == TST_decltype) {
BaseType = BuildDecltypeType(DS.getRepAsExpr(), DS.getTypeSpecTypeLoc());
} else if (DS.getTypeSpecType() == TST_decltype_auto) {
Diag(DS.getTypeSpecTypeLoc(), diag::err_decltype_auto_invalid);
return true;
} else {
LookupResult R(*this, MemberOrBase, IdLoc, LookupOrdinaryName);
LookupParsedName(R, S, &SS);
TypeDecl *TyD = R.getAsSingle<TypeDecl>();
if (!TyD) {
if (R.isAmbiguous()) return true;
// We don't want access-control diagnostics here.
R.suppressDiagnostics();
if (SS.isSet() && isDependentScopeSpecifier(SS)) {
bool NotUnknownSpecialization = false;
DeclContext *DC = computeDeclContext(SS, false);
if (CXXRecordDecl *Record = dyn_cast_or_null<CXXRecordDecl>(DC))
NotUnknownSpecialization = !Record->hasAnyDependentBases();
if (!NotUnknownSpecialization) {
// When the scope specifier can refer to a member of an unknown
// specialization, we take it as a type name.
BaseType = CheckTypenameType(ETK_None, SourceLocation(),
SS.getWithLocInContext(Context),
*MemberOrBase, IdLoc);
if (BaseType.isNull())
return true;
TInfo = Context.CreateTypeSourceInfo(BaseType);
DependentNameTypeLoc TL =
TInfo->getTypeLoc().castAs<DependentNameTypeLoc>();
if (!TL.isNull()) {
TL.setNameLoc(IdLoc);
TL.setElaboratedKeywordLoc(SourceLocation());
TL.setQualifierLoc(SS.getWithLocInContext(Context));
}
R.clear();
R.setLookupName(MemberOrBase);
}
}
// If no results were found, try to correct typos.
TypoCorrection Corr;
MemInitializerValidatorCCC CCC(ClassDecl);
if (R.empty() && BaseType.isNull() &&
(Corr = CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), S, &SS,
CCC, CTK_ErrorRecovery, ClassDecl))) {
if (FieldDecl *Member = Corr.getCorrectionDeclAs<FieldDecl>()) {
// We have found a non-static data member with a similar
// name to what was typed; complain and initialize that
// member.
diagnoseTypo(Corr,
PDiag(diag::err_mem_init_not_member_or_class_suggest)
<< MemberOrBase << true);
return BuildMemberInitializer(Member, Init, IdLoc);
} else if (TypeDecl *Type = Corr.getCorrectionDeclAs<TypeDecl>()) {
const CXXBaseSpecifier *DirectBaseSpec;
const CXXBaseSpecifier *VirtualBaseSpec;
if (FindBaseInitializer(*this, ClassDecl,
Context.getTypeDeclType(Type),
DirectBaseSpec, VirtualBaseSpec)) {
// We have found a direct or virtual base class with a
// similar name to what was typed; complain and initialize
// that base class.
diagnoseTypo(Corr,
PDiag(diag::err_mem_init_not_member_or_class_suggest)
<< MemberOrBase << false,
PDiag() /*Suppress note, we provide our own.*/);
const CXXBaseSpecifier *BaseSpec = DirectBaseSpec ? DirectBaseSpec
: VirtualBaseSpec;
Diag(BaseSpec->getBeginLoc(), diag::note_base_class_specified_here)
<< BaseSpec->getType() << BaseSpec->getSourceRange();
TyD = Type;
}
}
}
if (!TyD && BaseType.isNull()) {
Diag(IdLoc, diag::err_mem_init_not_member_or_class)
<< MemberOrBase << SourceRange(IdLoc,Init->getSourceRange().getEnd());
return true;
}
}
if (BaseType.isNull()) {
BaseType = Context.getTypeDeclType(TyD);
MarkAnyDeclReferenced(TyD->getLocation(), TyD, /*OdrUse=*/false);
if (SS.isSet()) {
BaseType = Context.getElaboratedType(ETK_None, SS.getScopeRep(),
BaseType);
TInfo = Context.CreateTypeSourceInfo(BaseType);
ElaboratedTypeLoc TL = TInfo->getTypeLoc().castAs<ElaboratedTypeLoc>();
TL.getNamedTypeLoc().castAs<TypeSpecTypeLoc>().setNameLoc(IdLoc);
TL.setElaboratedKeywordLoc(SourceLocation());
TL.setQualifierLoc(SS.getWithLocInContext(Context));
}
}
}
if (!TInfo)
TInfo = Context.getTrivialTypeSourceInfo(BaseType, IdLoc);
return BuildBaseInitializer(BaseType, TInfo, Init, ClassDecl, EllipsisLoc);
}
MemInitResult
Sema::BuildMemberInitializer(ValueDecl *Member, Expr *Init,
SourceLocation IdLoc) {
FieldDecl *DirectMember = dyn_cast<FieldDecl>(Member);
IndirectFieldDecl *IndirectMember = dyn_cast<IndirectFieldDecl>(Member);
assert((DirectMember || IndirectMember) &&
"Member must be a FieldDecl or IndirectFieldDecl");
if (DiagnoseUnexpandedParameterPack(Init, UPPC_Initializer))
return true;
if (Member->isInvalidDecl())
return true;
MultiExprArg Args;
if (ParenListExpr *ParenList = dyn_cast<ParenListExpr>(Init)) {
Args = MultiExprArg(ParenList->getExprs(), ParenList->getNumExprs());
} else if (InitListExpr *InitList = dyn_cast<InitListExpr>(Init)) {
Args = MultiExprArg(InitList->getInits(), InitList->getNumInits());
} else {
// Template instantiation doesn't reconstruct ParenListExprs for us.
Args = Init;
}
SourceRange InitRange = Init->getSourceRange();
if (Member->getType()->isDependentType() || Init->isTypeDependent()) {
// Can't check initialization for a member of dependent type or when
// any of the arguments are type-dependent expressions.
DiscardCleanupsInEvaluationContext();
} else {
bool InitList = false;
if (isa<InitListExpr>(Init)) {
InitList = true;
Args = Init;
}
// Initialize the member.
InitializedEntity MemberEntity =
DirectMember ? InitializedEntity::InitializeMember(DirectMember, nullptr)
: InitializedEntity::InitializeMember(IndirectMember,
nullptr);
InitializationKind Kind =
InitList ? InitializationKind::CreateDirectList(
IdLoc, Init->getBeginLoc(), Init->getEndLoc())
: InitializationKind::CreateDirect(IdLoc, InitRange.getBegin(),
InitRange.getEnd());
InitializationSequence InitSeq(*this, MemberEntity, Kind, Args);
ExprResult MemberInit = InitSeq.Perform(*this, MemberEntity, Kind, Args,
nullptr);
if (MemberInit.isInvalid())
return true;
// C++11 [class.base.init]p7:
// The initialization of each base and member constitutes a
// full-expression.
MemberInit = ActOnFinishFullExpr(MemberInit.get(), InitRange.getBegin(),
/*DiscardedValue*/ false);
if (MemberInit.isInvalid())
return true;
Init = MemberInit.get();
}
if (DirectMember) {
return new (Context) CXXCtorInitializer(Context, DirectMember, IdLoc,
InitRange.getBegin(), Init,
InitRange.getEnd());
} else {
return new (Context) CXXCtorInitializer(Context, IndirectMember, IdLoc,
InitRange.getBegin(), Init,
InitRange.getEnd());
}
}
MemInitResult
Sema::BuildDelegatingInitializer(TypeSourceInfo *TInfo, Expr *Init,
CXXRecordDecl *ClassDecl) {
SourceLocation NameLoc = TInfo->getTypeLoc().getLocalSourceRange().getBegin();
if (!LangOpts.CPlusPlus11)
return Diag(NameLoc, diag::err_delegating_ctor)
<< TInfo->getTypeLoc().getLocalSourceRange();
Diag(NameLoc, diag::warn_cxx98_compat_delegating_ctor);
bool InitList = true;
MultiExprArg Args = Init;
if (ParenListExpr *ParenList = dyn_cast<ParenListExpr>(Init)) {
InitList = false;
Args = MultiExprArg(ParenList->getExprs(), ParenList->getNumExprs());
}
SourceRange InitRange = Init->getSourceRange();
// Initialize the object.
InitializedEntity DelegationEntity = InitializedEntity::InitializeDelegation(
QualType(ClassDecl->getTypeForDecl(), 0));
InitializationKind Kind =
InitList ? InitializationKind::CreateDirectList(
NameLoc, Init->getBeginLoc(), Init->getEndLoc())
: InitializationKind::CreateDirect(NameLoc, InitRange.getBegin(),
InitRange.getEnd());
InitializationSequence InitSeq(*this, DelegationEntity, Kind, Args);
ExprResult DelegationInit = InitSeq.Perform(*this, DelegationEntity, Kind,
Args, nullptr);
if (DelegationInit.isInvalid())
return true;
assert(cast<CXXConstructExpr>(DelegationInit.get())->getConstructor() &&
"Delegating constructor with no target?");
// C++11 [class.base.init]p7:
// The initialization of each base and member constitutes a
// full-expression.
DelegationInit = ActOnFinishFullExpr(
DelegationInit.get(), InitRange.getBegin(), /*DiscardedValue*/ false);
if (DelegationInit.isInvalid())
return true;
// If we are in a dependent context, template instantiation will
// perform this type-checking again. Just save the arguments that we
// received in a ParenListExpr.
// FIXME: This isn't quite ideal, since our ASTs don't capture all
// of the information that we have about the base
// initializer. However, deconstructing the ASTs is a dicey process,
// and this approach is far more likely to get the corner cases right.
if (CurContext->isDependentContext())
DelegationInit = Init;
return new (Context) CXXCtorInitializer(Context, TInfo, InitRange.getBegin(),
DelegationInit.getAs<Expr>(),
InitRange.getEnd());
}
MemInitResult
Sema::BuildBaseInitializer(QualType BaseType, TypeSourceInfo *BaseTInfo,
Expr *Init, CXXRecordDecl *ClassDecl,
SourceLocation EllipsisLoc) {
SourceLocation BaseLoc
= BaseTInfo->getTypeLoc().getLocalSourceRange().getBegin();
if (!BaseType->isDependentType() && !BaseType->isRecordType())
return Diag(BaseLoc, diag::err_base_init_does_not_name_class)
<< BaseType << BaseTInfo->getTypeLoc().getLocalSourceRange();
// C++ [class.base.init]p2:
// [...] Unless the mem-initializer-id names a nonstatic data
// member of the constructor's class or a direct or virtual base
// of that class, the mem-initializer is ill-formed. A
// mem-initializer-list can initialize a base class using any
// name that denotes that base class type.
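// For instance, a typedef that denotes the base class is acceptable,
// roughly:
//
//   struct Base {};
//   typedef Base Alias;
//   struct Derived : Base {
//     Derived() : Alias() {}   // OK, 'Alias' denotes the base class type
//   };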
bool Dependent = BaseType->isDependentType() || Init->isTypeDependent();
SourceRange InitRange = Init->getSourceRange();
if (EllipsisLoc.isValid()) {
// This is a pack expansion.
if (!BaseType->containsUnexpandedParameterPack()) {
Diag(EllipsisLoc, diag::err_pack_expansion_without_parameter_packs)
<< SourceRange(BaseLoc, InitRange.getEnd());
EllipsisLoc = SourceLocation();
}
} else {
// Check for any unexpanded parameter packs.
if (DiagnoseUnexpandedParameterPack(BaseLoc, BaseTInfo, UPPC_Initializer))
return true;
if (DiagnoseUnexpandedParameterPack(Init, UPPC_Initializer))
return true;
}
// Check for direct and virtual base classes.
const CXXBaseSpecifier *DirectBaseSpec = nullptr;
const CXXBaseSpecifier *VirtualBaseSpec = nullptr;
if (!Dependent) {
if (Context.hasSameUnqualifiedType(QualType(ClassDecl->getTypeForDecl(),0),
BaseType))
return BuildDelegatingInitializer(BaseTInfo, Init, ClassDecl);
FindBaseInitializer(*this, ClassDecl, BaseType, DirectBaseSpec,
VirtualBaseSpec);
// C++ [base.class.init]p2:
// Unless the mem-initializer-id names a nonstatic data member of the
// constructor's class or a direct or virtual base of that class, the
// mem-initializer is ill-formed.
if (!DirectBaseSpec && !VirtualBaseSpec) {
// If the class has any dependent bases, then it's possible that
// one of those types will resolve to the same type as
// BaseType. Therefore, just treat this as a dependent base
// class initialization. FIXME: Should we try to check the
// initialization anyway? It seems odd.
if (ClassDecl->hasAnyDependentBases())
Dependent = true;
else
return Diag(BaseLoc, diag::err_not_direct_base_or_virtual)
<< BaseType << Context.getTypeDeclType(ClassDecl)
<< BaseTInfo->getTypeLoc().getLocalSourceRange();
}
}
if (Dependent) {
DiscardCleanupsInEvaluationContext();
return new (Context) CXXCtorInitializer(Context, BaseTInfo,
/*IsVirtual=*/false,
InitRange.getBegin(), Init,
InitRange.getEnd(), EllipsisLoc);
}
// C++ [base.class.init]p2:
// If a mem-initializer-id is ambiguous because it designates both
// a direct non-virtual base class and an inherited virtual base
// class, the mem-initializer is ill-formed.
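// For example, code along these lines hits the diagnostic below:
//
//   struct A {};
//   struct B : virtual A {};
//   struct C : A, B {
//     C() : A() {}   // ambiguous: 'A' is a direct base and also an
//   };               //            inherited virtual base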
if (DirectBaseSpec && VirtualBaseSpec)
return Diag(BaseLoc, diag::err_base_init_direct_and_virtual)
<< BaseType << BaseTInfo->getTypeLoc().getLocalSourceRange();
const CXXBaseSpecifier *BaseSpec = DirectBaseSpec;
if (!BaseSpec)
BaseSpec = VirtualBaseSpec;
// Initialize the base.
bool InitList = true;
MultiExprArg Args = Init;
if (ParenListExpr *ParenList = dyn_cast<ParenListExpr>(Init)) {
InitList = false;
Args = MultiExprArg(ParenList->getExprs(), ParenList->getNumExprs());
}
InitializedEntity BaseEntity =
InitializedEntity::InitializeBase(Context, BaseSpec, VirtualBaseSpec);
InitializationKind Kind =
InitList ? InitializationKind::CreateDirectList(BaseLoc)
: InitializationKind::CreateDirect(BaseLoc, InitRange.getBegin(),
InitRange.getEnd());
InitializationSequence InitSeq(*this, BaseEntity, Kind, Args);
ExprResult BaseInit = InitSeq.Perform(*this, BaseEntity, Kind, Args, nullptr);
if (BaseInit.isInvalid())
return true;
// C++11 [class.base.init]p7:
// The initialization of each base and member constitutes a
// full-expression.
BaseInit = ActOnFinishFullExpr(BaseInit.get(), InitRange.getBegin(),
/*DiscardedValue*/ false);
if (BaseInit.isInvalid())
return true;
// If we are in a dependent context, template instantiation will
// perform this type-checking again. Just save the arguments that we
// received in a ParenListExpr.
// FIXME: This isn't quite ideal, since our ASTs don't capture all
// of the information that we have about the base
// initializer. However, deconstructing the ASTs is a dicey process,
// and this approach is far more likely to get the corner cases right.
if (CurContext->isDependentContext())
BaseInit = Init;
return new (Context) CXXCtorInitializer(Context, BaseTInfo,
BaseSpec->isVirtual(),
InitRange.getBegin(),
BaseInit.getAs<Expr>(),
InitRange.getEnd(), EllipsisLoc);
}
// Create a static_cast<T&&>(expr).
static Expr *CastForMoving(Sema &SemaRef, Expr *E, QualType T = QualType()) {
if (T.isNull()) T = E->getType();
QualType TargetType = SemaRef.BuildReferenceType(
T, /*SpelledAsLValue*/false, SourceLocation(), DeclarationName());
SourceLocation ExprLoc = E->getBeginLoc();
TypeSourceInfo *TargetLoc = SemaRef.Context.getTrivialTypeSourceInfo(
TargetType, ExprLoc);
return SemaRef.BuildCXXNamedCast(ExprLoc, tok::kw_static_cast, TargetLoc, E,
SourceRange(ExprLoc, ExprLoc),
E->getSourceRange()).get();
}
/// ImplicitInitializerKind - How an implicit base or member initializer should
/// initialize its base or member.
enum ImplicitInitializerKind {
IIK_Default,
IIK_Copy,
IIK_Move,
IIK_Inherit
};
static bool
BuildImplicitBaseInitializer(Sema &SemaRef, CXXConstructorDecl *Constructor,
ImplicitInitializerKind ImplicitInitKind,
CXXBaseSpecifier *BaseSpec,
bool IsInheritedVirtualBase,
CXXCtorInitializer *&CXXBaseInit) {
InitializedEntity InitEntity
= InitializedEntity::InitializeBase(SemaRef.Context, BaseSpec,
IsInheritedVirtualBase);
ExprResult BaseInit;
switch (ImplicitInitKind) {
case IIK_Inherit:
case IIK_Default: {
InitializationKind InitKind
= InitializationKind::CreateDefault(Constructor->getLocation());
InitializationSequence InitSeq(SemaRef, InitEntity, InitKind, None);
BaseInit = InitSeq.Perform(SemaRef, InitEntity, InitKind, None);
break;
}
case IIK_Move:
case IIK_Copy: {
bool Moving = ImplicitInitKind == IIK_Move;
ParmVarDecl *Param = Constructor->getParamDecl(0);
QualType ParamType = Param->getType().getNonReferenceType();
Expr *CopyCtorArg =
DeclRefExpr::Create(SemaRef.Context, NestedNameSpecifierLoc(),
SourceLocation(), Param, false,
Constructor->getLocation(), ParamType,
VK_LValue, nullptr);
SemaRef.MarkDeclRefReferenced(cast<DeclRefExpr>(CopyCtorArg));
// Cast to the base class to avoid ambiguities.
QualType ArgTy =
SemaRef.Context.getQualifiedType(BaseSpec->getType().getUnqualifiedType(),
ParamType.getQualifiers());
if (Moving) {
CopyCtorArg = CastForMoving(SemaRef, CopyCtorArg);
}
CXXCastPath BasePath;
BasePath.push_back(BaseSpec);
CopyCtorArg = SemaRef.ImpCastExprToType(CopyCtorArg, ArgTy,
CK_UncheckedDerivedToBase,
Moving ? VK_XValue : VK_LValue,
&BasePath).get();
InitializationKind InitKind
= InitializationKind::CreateDirect(Constructor->getLocation(),
SourceLocation(), SourceLocation());
InitializationSequence InitSeq(SemaRef, InitEntity, InitKind, CopyCtorArg);
BaseInit = InitSeq.Perform(SemaRef, InitEntity, InitKind, CopyCtorArg);
break;
}
}
BaseInit = SemaRef.MaybeCreateExprWithCleanups(BaseInit);
if (BaseInit.isInvalid())
return true;
CXXBaseInit =
new (SemaRef.Context) CXXCtorInitializer(SemaRef.Context,
SemaRef.Context.getTrivialTypeSourceInfo(BaseSpec->getType(),
SourceLocation()),
BaseSpec->isVirtual(),
SourceLocation(),
BaseInit.getAs<Expr>(),
SourceLocation(),
SourceLocation());
return false;
}
static bool RefersToRValueRef(Expr *MemRef) {
ValueDecl *Referenced = cast<MemberExpr>(MemRef)->getMemberDecl();
return Referenced->getType()->isRValueReferenceType();
}
static bool
BuildImplicitMemberInitializer(Sema &SemaRef, CXXConstructorDecl *Constructor,
ImplicitInitializerKind ImplicitInitKind,
FieldDecl *Field, IndirectFieldDecl *Indirect,
CXXCtorInitializer *&CXXMemberInit) {
if (Field->isInvalidDecl())
return true;
SourceLocation Loc = Constructor->getLocation();
if (ImplicitInitKind == IIK_Copy || ImplicitInitKind == IIK_Move) {
bool Moving = ImplicitInitKind == IIK_Move;
ParmVarDecl *Param = Constructor->getParamDecl(0);
QualType ParamType = Param->getType().getNonReferenceType();
// Suppress copying zero-width bitfields.
if (Field->isZeroLengthBitField(SemaRef.Context))
return false;
Expr *MemberExprBase =
DeclRefExpr::Create(SemaRef.Context, NestedNameSpecifierLoc(),
SourceLocation(), Param, false,
Loc, ParamType, VK_LValue, nullptr);
SemaRef.MarkDeclRefReferenced(cast<DeclRefExpr>(MemberExprBase));
if (Moving) {
MemberExprBase = CastForMoving(SemaRef, MemberExprBase);
}
// Build a reference to this field within the parameter.
CXXScopeSpec SS;
LookupResult MemberLookup(SemaRef, Field->getDeclName(), Loc,
Sema::LookupMemberName);
MemberLookup.addDecl(Indirect ? cast<ValueDecl>(Indirect)
: cast<ValueDecl>(Field), AS_public);
MemberLookup.resolveKind();
ExprResult CtorArg
= SemaRef.BuildMemberReferenceExpr(MemberExprBase,
ParamType, Loc,
/*IsArrow=*/false,
SS,
/*TemplateKWLoc=*/SourceLocation(),
/*FirstQualifierInScope=*/nullptr,
MemberLookup,
/*TemplateArgs=*/nullptr,
/*S*/nullptr);
if (CtorArg.isInvalid())
return true;
// C++11 [class.copy]p15:
// - if a member m has rvalue reference type T&&, it is direct-initialized
// with static_cast<T&&>(x.m);
if (RefersToRValueRef(CtorArg.get())) {
CtorArg = CastForMoving(SemaRef, CtorArg.get());
}
InitializedEntity Entity =
Indirect ? InitializedEntity::InitializeMember(Indirect, nullptr,
/*Implicit*/ true)
: InitializedEntity::InitializeMember(Field, nullptr,
/*Implicit*/ true);
// Direct-initialize to use the copy constructor.
InitializationKind InitKind =
InitializationKind::CreateDirect(Loc, SourceLocation(), SourceLocation());
Expr *CtorArgE = CtorArg.getAs<Expr>();
InitializationSequence InitSeq(SemaRef, Entity, InitKind, CtorArgE);
ExprResult MemberInit =
InitSeq.Perform(SemaRef, Entity, InitKind, MultiExprArg(&CtorArgE, 1));
MemberInit = SemaRef.MaybeCreateExprWithCleanups(MemberInit);
if (MemberInit.isInvalid())
return true;
if (Indirect)
CXXMemberInit = new (SemaRef.Context) CXXCtorInitializer(
SemaRef.Context, Indirect, Loc, Loc, MemberInit.getAs<Expr>(), Loc);
else
CXXMemberInit = new (SemaRef.Context) CXXCtorInitializer(
SemaRef.Context, Field, Loc, Loc, MemberInit.getAs<Expr>(), Loc);
return false;
}
assert((ImplicitInitKind == IIK_Default || ImplicitInitKind == IIK_Inherit) &&
"Unhandled implicit init kind!");
QualType FieldBaseElementType =
SemaRef.Context.getBaseElementType(Field->getType());
if (FieldBaseElementType->isRecordType()) {
InitializedEntity InitEntity =
Indirect ? InitializedEntity::InitializeMember(Indirect, nullptr,
/*Implicit*/ true)
: InitializedEntity::InitializeMember(Field, nullptr,
/*Implicit*/ true);
InitializationKind InitKind =
InitializationKind::CreateDefault(Loc);
InitializationSequence InitSeq(SemaRef, InitEntity, InitKind, None);
ExprResult MemberInit =
InitSeq.Perform(SemaRef, InitEntity, InitKind, None);
MemberInit = SemaRef.MaybeCreateExprWithCleanups(MemberInit);
if (MemberInit.isInvalid())
return true;
if (Indirect)
CXXMemberInit = new (SemaRef.Context) CXXCtorInitializer(SemaRef.Context,
Indirect, Loc,
Loc,
MemberInit.get(),
Loc);
else
CXXMemberInit = new (SemaRef.Context) CXXCtorInitializer(SemaRef.Context,
Field, Loc, Loc,
MemberInit.get(),
Loc);
return false;
}
if (!Field->getParent()->isUnion()) {
if (FieldBaseElementType->isReferenceType()) {
SemaRef.Diag(Constructor->getLocation(),
diag::err_uninitialized_member_in_ctor)
<< (int)Constructor->isImplicit()
<< SemaRef.Context.getTagDeclType(Constructor->getParent())
<< 0 << Field->getDeclName();
SemaRef.Diag(Field->getLocation(), diag::note_declared_at);
return true;
}
if (FieldBaseElementType.isConstQualified()) {
SemaRef.Diag(Constructor->getLocation(),
diag::err_uninitialized_member_in_ctor)
<< (int)Constructor->isImplicit()
<< SemaRef.Context.getTagDeclType(Constructor->getParent())
<< 1 << Field->getDeclName();
SemaRef.Diag(Field->getLocation(), diag::note_declared_at);
return true;
}
}
if (FieldBaseElementType.hasNonTrivialObjCLifetime()) {
// ARC and Weak:
// Default-initialize Objective-C pointers to NULL.
CXXMemberInit
= new (SemaRef.Context) CXXCtorInitializer(SemaRef.Context, Field,
Loc, Loc,
new (SemaRef.Context) ImplicitValueInitExpr(Field->getType()),
Loc);
return false;
}
// Nothing to initialize.
CXXMemberInit = nullptr;
return false;
}
namespace {
struct BaseAndFieldInfo {
Sema &S;
CXXConstructorDecl *Ctor;
bool AnyErrorsInInits;
ImplicitInitializerKind IIK;
llvm::DenseMap<const void *, CXXCtorInitializer*> AllBaseFields;
SmallVector<CXXCtorInitializer*, 8> AllToInit;
llvm::DenseMap<TagDecl*, FieldDecl*> ActiveUnionMember;
BaseAndFieldInfo(Sema &S, CXXConstructorDecl *Ctor, bool ErrorsInInits)
: S(S), Ctor(Ctor), AnyErrorsInInits(ErrorsInInits) {
bool Generated = Ctor->isImplicit() || Ctor->isDefaulted();
if (Ctor->getInheritedConstructor())
IIK = IIK_Inherit;
else if (Generated && Ctor->isCopyConstructor())
IIK = IIK_Copy;
else if (Generated && Ctor->isMoveConstructor())
IIK = IIK_Move;
else
IIK = IIK_Default;
}
bool isImplicitCopyOrMove() const {
switch (IIK) {
case IIK_Copy:
case IIK_Move:
return true;
case IIK_Default:
case IIK_Inherit:
return false;
}
llvm_unreachable("Invalid ImplicitInitializerKind!");
}
bool addFieldInitializer(CXXCtorInitializer *Init) {
AllToInit.push_back(Init);
// Check whether this initializer makes the field "used".
if (Init->getInit()->HasSideEffects(S.Context))
S.UnusedPrivateFields.remove(Init->getAnyMember());
return false;
}
bool isInactiveUnionMember(FieldDecl *Field) {
RecordDecl *Record = Field->getParent();
if (!Record->isUnion())
return false;
if (FieldDecl *Active =
ActiveUnionMember.lookup(Record->getCanonicalDecl()))
return Active != Field->getCanonicalDecl();
// In an implicit copy or move constructor, ignore any in-class initializer.
if (isImplicitCopyOrMove())
return true;
// If there's no explicit initialization, the field is active only if it
// has an in-class initializer...
if (Field->hasInClassInitializer())
return false;
// ... or it's an anonymous struct or union whose class has an in-class
// initializer.
if (!Field->isAnonymousStructOrUnion())
return true;
CXXRecordDecl *FieldRD = Field->getType()->getAsCXXRecordDecl();
return !FieldRD->hasInClassInitializer();
}
/// Determine whether the given field is, or is within, a union member
/// that is inactive (because there was an initializer given for a different
/// member of the union, or because the union was not initialized at all).
bool isWithinInactiveUnionMember(FieldDecl *Field,
IndirectFieldDecl *Indirect) {
if (!Indirect)
return isInactiveUnionMember(Field);
for (auto *C : Indirect->chain()) {
FieldDecl *Field = dyn_cast<FieldDecl>(C);
if (Field && isInactiveUnionMember(Field))
return true;
}
return false;
}
};
}
/// Determine whether the given type is an incomplete or zero-length
/// array type.
static bool isIncompleteOrZeroLengthArrayType(ASTContext &Context, QualType T) {
if (T->isIncompleteArrayType())
return true;
while (const ConstantArrayType *ArrayT = Context.getAsConstantArrayType(T)) {
if (!ArrayT->getSize())
return true;
T = ArrayT->getElementType();
}
return false;
}
static bool CollectFieldInitializer(Sema &SemaRef, BaseAndFieldInfo &Info,
FieldDecl *Field,
IndirectFieldDecl *Indirect = nullptr) {
if (Field->isInvalidDecl())
return false;
// Overwhelmingly common case: we have a direct initializer for this field.
if (CXXCtorInitializer *Init =
Info.AllBaseFields.lookup(Field->getCanonicalDecl()))
return Info.addFieldInitializer(Init);
// C++11 [class.base.init]p8:
// if the entity is a non-static data member that has a
// brace-or-equal-initializer and either
// -- the constructor's class is a union and no other variant member of that
// union is designated by a mem-initializer-id or
// -- the constructor's class is not a union, and, if the entity is a member
// of an anonymous union, no other member of that union is designated by
// a mem-initializer-id,
// the entity is initialized as specified in [dcl.init].
//
// We also apply the same rules to handle anonymous structs within anonymous
// unions.
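// For example, roughly:
//
//   union U {
//     int a = 1;   // 'a' gets its default member initializer here...
//     float b;
//     U() {}       // ...because no other variant member is mem-initialized
//   };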
if (Info.isWithinInactiveUnionMember(Field, Indirect))
return false;
if (Field->hasInClassInitializer() && !Info.isImplicitCopyOrMove()) {
ExprResult DIE =
SemaRef.BuildCXXDefaultInitExpr(Info.Ctor->getLocation(), Field);
if (DIE.isInvalid())
return true;
auto Entity = InitializedEntity::InitializeMember(Field, nullptr, true);
SemaRef.checkInitializerLifetime(Entity, DIE.get());
CXXCtorInitializer *Init;
if (Indirect)
Init = new (SemaRef.Context)
CXXCtorInitializer(SemaRef.Context, Indirect, SourceLocation(),
SourceLocation(), DIE.get(), SourceLocation());
else
Init = new (SemaRef.Context)
CXXCtorInitializer(SemaRef.Context, Field, SourceLocation(),
SourceLocation(), DIE.get(), SourceLocation());
return Info.addFieldInitializer(Init);
}
// Don't initialize incomplete or zero-length arrays.
if (isIncompleteOrZeroLengthArrayType(SemaRef.Context, Field->getType()))
return false;
// Don't try to build an implicit initializer if there were semantic
// errors in any of the initializers (and therefore we might be
// missing some that the user actually wrote).
if (Info.AnyErrorsInInits)
return false;
CXXCtorInitializer *Init = nullptr;
if (BuildImplicitMemberInitializer(Info.S, Info.Ctor, Info.IIK, Field,
Indirect, Init))
return true;
if (!Init)
return false;
return Info.addFieldInitializer(Init);
}
bool
Sema::SetDelegatingInitializer(CXXConstructorDecl *Constructor,
CXXCtorInitializer *Initializer) {
assert(Initializer->isDelegatingInitializer());
Constructor->setNumCtorInitializers(1);
CXXCtorInitializer **initializer =
new (Context) CXXCtorInitializer*[1];
memcpy(initializer, &Initializer, sizeof (CXXCtorInitializer*));
Constructor->setCtorInitializers(initializer);
if (CXXDestructorDecl *Dtor = LookupDestructor(Constructor->getParent())) {
MarkFunctionReferenced(Initializer->getSourceLocation(), Dtor);
DiagnoseUseOfDecl(Dtor, Initializer->getSourceLocation());
}
DelegatingCtorDecls.push_back(Constructor);
DiagnoseUninitializedFields(*this, Constructor);
return false;
}
bool Sema::SetCtorInitializers(CXXConstructorDecl *Constructor, bool AnyErrors,
ArrayRef<CXXCtorInitializer *> Initializers) {
if (Constructor->isDependentContext()) {
// Just store the initializers as written, they will be checked during
// instantiation.
if (!Initializers.empty()) {
Constructor->setNumCtorInitializers(Initializers.size());
CXXCtorInitializer **baseOrMemberInitializers =
new (Context) CXXCtorInitializer*[Initializers.size()];
memcpy(baseOrMemberInitializers, Initializers.data(),
Initializers.size() * sizeof(CXXCtorInitializer*));
Constructor->setCtorInitializers(baseOrMemberInitializers);
}
// Let template instantiation know whether we had errors.
if (AnyErrors)
Constructor->setInvalidDecl();
return false;
}
BaseAndFieldInfo Info(*this, Constructor, AnyErrors);
// We need to build the initializer AST according to the order of
// construction, not the order the user specified in the Initializers list.
CXXRecordDecl *ClassDecl = Constructor->getParent()->getDefinition();
if (!ClassDecl)
return true;
bool HadError = false;
for (unsigned i = 0; i < Initializers.size(); i++) {
CXXCtorInitializer *Member = Initializers[i];
if (Member->isBaseInitializer())
Info.AllBaseFields[Member->getBaseClass()->getAs<RecordType>()] = Member;
else {
Info.AllBaseFields[Member->getAnyMember()->getCanonicalDecl()] = Member;
if (IndirectFieldDecl *F = Member->getIndirectMember()) {
for (auto *C : F->chain()) {
FieldDecl *FD = dyn_cast<FieldDecl>(C);
if (FD && FD->getParent()->isUnion())
Info.ActiveUnionMember.insert(std::make_pair(
FD->getParent()->getCanonicalDecl(), FD->getCanonicalDecl()));
}
} else if (FieldDecl *FD = Member->getMember()) {
if (FD->getParent()->isUnion())
Info.ActiveUnionMember.insert(std::make_pair(
FD->getParent()->getCanonicalDecl(), FD->getCanonicalDecl()));
}
}
}
// Keep track of the direct virtual bases.
llvm::SmallPtrSet<CXXBaseSpecifier *, 16> DirectVBases;
for (auto &I : ClassDecl->bases()) {
if (I.isVirtual())
DirectVBases.insert(&I);
}
// Push virtual bases before others.
for (auto &VBase : ClassDecl->vbases()) {
if (CXXCtorInitializer *Value
= Info.AllBaseFields.lookup(VBase.getType()->getAs<RecordType>())) {
// [class.base.init]p7, per DR257:
// A mem-initializer where the mem-initializer-id names a virtual base
// class is ignored during execution of a constructor of any class that
// is not the most derived class.
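// For example, a warning of this kind would typically be issued for:
//
//   struct V {};
//   struct Abstract : virtual V {
//     virtual void f() = 0;
//     Abstract() : V() {}   // ignored: Abstract is never the most derived
//   };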
if (ClassDecl->isAbstract()) {
// FIXME: Provide a fixit to remove the base specifier. This requires
// tracking the location of the associated comma for a base specifier.
Diag(Value->getSourceLocation(), diag::warn_abstract_vbase_init_ignored)
<< VBase.getType() << ClassDecl;
DiagnoseAbstractType(ClassDecl);
}
Info.AllToInit.push_back(Value);
} else if (!AnyErrors && !ClassDecl->isAbstract()) {
// [class.base.init]p8, per DR257:
// If a given [...] base class is not named by a mem-initializer-id
// [...] and the entity is not a virtual base class of an abstract
// class, then [...] the entity is default-initialized.
bool IsInheritedVirtualBase = !DirectVBases.count(&VBase);
CXXCtorInitializer *CXXBaseInit;
if (BuildImplicitBaseInitializer(*this, Constructor, Info.IIK,
&VBase, IsInheritedVirtualBase,
CXXBaseInit)) {
HadError = true;
continue;
}
Info.AllToInit.push_back(CXXBaseInit);
}
}
// Non-virtual bases.
for (auto &Base : ClassDecl->bases()) {
// Virtuals are in the virtual base list and already constructed.
if (Base.isVirtual())
continue;
if (CXXCtorInitializer *Value
= Info.AllBaseFields.lookup(Base.getType()->getAs<RecordType>())) {
Info.AllToInit.push_back(Value);
} else if (!AnyErrors) {
CXXCtorInitializer *CXXBaseInit;
if (BuildImplicitBaseInitializer(*this, Constructor, Info.IIK,
&Base, /*IsInheritedVirtualBase=*/false,
CXXBaseInit)) {
HadError = true;
continue;
}
Info.AllToInit.push_back(CXXBaseInit);
}
}
// Fields.
for (auto *Mem : ClassDecl->decls()) {
if (auto *F = dyn_cast<FieldDecl>(Mem)) {
// C++ [class.bit]p2:
// A declaration for a bit-field that omits the identifier declares an
// unnamed bit-field. Unnamed bit-fields are not members and cannot be
// initialized.
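// For example:
//
//   struct S { int : 8; };   // unnamed bit-field; no initializer is built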
if (F->isUnnamedBitfield())
continue;
// If we're not generating the implicit copy/move constructor, then we'll
// handle anonymous struct/union fields based on their individual
// indirect fields.
if (F->isAnonymousStructOrUnion() && !Info.isImplicitCopyOrMove())
continue;
if (CollectFieldInitializer(*this, Info, F))
HadError = true;
continue;
}
// Beyond this point, we only consider default initialization.
if (Info.isImplicitCopyOrMove())
continue;
if (auto *F = dyn_cast<IndirectFieldDecl>(Mem)) {
if (F->getType()->isIncompleteArrayType()) {
assert(ClassDecl->hasFlexibleArrayMember() &&
"Incomplete array type is not valid");
continue;
}
// Initialize each field of an anonymous struct individually.
if (CollectFieldInitializer(*this, Info, F->getAnonField(), F))
HadError = true;
continue;
}
}
unsigned NumInitializers = Info.AllToInit.size();
if (NumInitializers > 0) {
Constructor->setNumCtorInitializers(NumInitializers);
CXXCtorInitializer **baseOrMemberInitializers =
new (Context) CXXCtorInitializer*[NumInitializers];
memcpy(baseOrMemberInitializers, Info.AllToInit.data(),
NumInitializers * sizeof(CXXCtorInitializer*));
Constructor->setCtorInitializers(baseOrMemberInitializers);
// Constructors implicitly reference the base and member
// destructors.
MarkBaseAndMemberDestructorsReferenced(Constructor->getLocation(),
Constructor->getParent());
}
return HadError;
}
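// Illustrative sketch (not part of the original source): SetCtorInitializers
// builds the full list in construction order even when the written
// mem-initializer list is partial. For a hypothetical
//
//   struct Base { Base(); };
//   struct Member { Member(); };
//   struct S : Base {
//     Member m;
//     int id;
//     S(int i) : id(i) {}
//   };
//
// the constructor ends up with three CXXCtorInitializers: Base, then 'm'
// (both implicit), then 'id'.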
static void PopulateKeysForFields(FieldDecl *Field,
                                  SmallVectorImpl<const void*> &IdealInits) {
if (const RecordType *RT = Field->getType()->getAs<RecordType>()) {
const RecordDecl *RD = RT->getDecl();
if (RD->isAnonymousStructOrUnion()) {
for (auto *Field : RD->fields())
PopulateKeysForFields(Field, IdealInits);
return;
}
}
IdealInits.push_back(Field->getCanonicalDecl());
}
static const void *GetKeyForBase(ASTContext &Context, QualType BaseType) {
return Context.getCanonicalType(BaseType).getTypePtr();
}
static const void *GetKeyForMember(ASTContext &Context,
CXXCtorInitializer *Member) {
if (!Member->isAnyMemberInitializer())
return GetKeyForBase(Context, QualType(Member->getBaseClass(), 0));
return Member->getAnyMember()->getCanonicalDecl();
}
static void DiagnoseBaseOrMemInitializerOrder(
Sema &SemaRef, const CXXConstructorDecl *Constructor,
ArrayRef<CXXCtorInitializer *> Inits) {
if (Constructor->getDeclContext()->isDependentContext())
return;
// Don't check initializers order unless the warning is enabled at the
// location of at least one initializer.
bool ShouldCheckOrder = false;
for (unsigned InitIndex = 0; InitIndex != Inits.size(); ++InitIndex) {
CXXCtorInitializer *Init = Inits[InitIndex];
if (!SemaRef.Diags.isIgnored(diag::warn_initializer_out_of_order,
Init->getSourceLocation())) {
ShouldCheckOrder = true;
break;
}
}
if (!ShouldCheckOrder)
return;
// Build the list of bases and members in the order that they'll
// actually be initialized. The explicit initializers should be in
// this same order but may be missing things.
SmallVector<const void*, 32> IdealInitKeys;
const CXXRecordDecl *ClassDecl = Constructor->getParent();
// 1. Virtual bases.
for (const auto &VBase : ClassDecl->vbases())
IdealInitKeys.push_back(GetKeyForBase(SemaRef.Context, VBase.getType()));
// 2. Non-virtual bases.
for (const auto &Base : ClassDecl->bases()) {
if (Base.isVirtual())
continue;
IdealInitKeys.push_back(GetKeyForBase(SemaRef.Context, Base.getType()));
}
// 3. Direct fields.
for (auto *Field : ClassDecl->fields()) {
if (Field->isUnnamedBitfield())
continue;
PopulateKeysForFields(Field, IdealInitKeys);
}
unsigned NumIdealInits = IdealInitKeys.size();
unsigned IdealIndex = 0;
CXXCtorInitializer *PrevInit = nullptr;
for (unsigned InitIndex = 0; InitIndex != Inits.size(); ++InitIndex) {
CXXCtorInitializer *Init = Inits[InitIndex];
const void *InitKey = GetKeyForMember(SemaRef.Context, Init);
// Scan forward to try to find this initializer in the idealized
// initializers list.
for (; IdealIndex != NumIdealInits; ++IdealIndex)
if (InitKey == IdealInitKeys[IdealIndex])
break;
// If we didn't find this initializer, it must be because we
// scanned past it on a previous iteration. That can only
// happen if we're out of order; emit a warning.
if (IdealIndex == NumIdealInits && PrevInit) {
Sema::SemaDiagnosticBuilder D =
SemaRef.Diag(PrevInit->getSourceLocation(),
diag::warn_initializer_out_of_order);
if (PrevInit->isAnyMemberInitializer())
D << 0 << PrevInit->getAnyMember()->getDeclName();
else
D << 1 << PrevInit->getTypeSourceInfo()->getType();
if (Init->isAnyMemberInitializer())
D << 0 << Init->getAnyMember()->getDeclName();
else
D << 1 << Init->getTypeSourceInfo()->getType();
// Move back to the initializer's location in the ideal list.
for (IdealIndex = 0; IdealIndex != NumIdealInits; ++IdealIndex)
if (InitKey == IdealInitKeys[IdealIndex])
break;
assert(IdealIndex < NumIdealInits &&
"initializer not found in initializer list");
}
PrevInit = Init;
}
}
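// Hedged example (hypothetical user code, not upstream): the -Wreorder
// diagnostic produced above fires when the written order differs from the
// actual construction order, which is declaration order:
//
//   struct S {
//     int a;
//     int b;
//     S() : b(1), a(b) {}  // warning: field 'b' will be initialized after 'a'
//   };
//
// 'a' is still initialized first, so 'a(b)' reads an uninitialized 'b'.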
namespace {
bool CheckRedundantInit(Sema &S,
CXXCtorInitializer *Init,
CXXCtorInitializer *&PrevInit) {
if (!PrevInit) {
PrevInit = Init;
return false;
}
if (FieldDecl *Field = Init->getAnyMember())
S.Diag(Init->getSourceLocation(),
diag::err_multiple_mem_initialization)
<< Field->getDeclName()
<< Init->getSourceRange();
else {
const Type *BaseClass = Init->getBaseClass();
assert(BaseClass && "neither field nor base");
S.Diag(Init->getSourceLocation(),
diag::err_multiple_base_initialization)
<< QualType(BaseClass, 0)
<< Init->getSourceRange();
}
S.Diag(PrevInit->getSourceLocation(), diag::note_previous_initializer)
<< 0 << PrevInit->getSourceRange();
return true;
}
typedef std::pair<NamedDecl *, CXXCtorInitializer *> UnionEntry;
typedef llvm::DenseMap<RecordDecl*, UnionEntry> RedundantUnionMap;
bool CheckRedundantUnionInit(Sema &S,
CXXCtorInitializer *Init,
RedundantUnionMap &Unions) {
FieldDecl *Field = Init->getAnyMember();
RecordDecl *Parent = Field->getParent();
NamedDecl *Child = Field;
while (Parent->isAnonymousStructOrUnion() || Parent->isUnion()) {
if (Parent->isUnion()) {
UnionEntry &En = Unions[Parent];
if (En.first && En.first != Child) {
S.Diag(Init->getSourceLocation(),
diag::err_multiple_mem_union_initialization)
<< Field->getDeclName()
<< Init->getSourceRange();
S.Diag(En.second->getSourceLocation(), diag::note_previous_initializer)
<< 0 << En.second->getSourceRange();
return true;
}
if (!En.first) {
En.first = Child;
En.second = Init;
}
if (!Parent->isAnonymousStructOrUnion())
return false;
}
Child = Parent;
Parent = cast<RecordDecl>(Parent->getDeclContext());
}
return false;
}
}
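// Illustrative sketch (hypothetical user code): CheckRedundantInit and
// CheckRedundantUnionInit reject constructors that name the same member twice
// or initialize more than one member of the same union, e.g.:
//
//   struct T {
//     union { int u; float v; };
//     int x;
//     T() : x(0), x(1) {}       // error: multiple initializations of 'x'
//     T(int) : u(0), v(1.f) {}  // error: initializing multiple members of
//   };                          // a union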
/// ActOnMemInitializers - Handle the member initializers for a constructor.
void Sema::ActOnMemInitializers(Decl *ConstructorDecl,
SourceLocation ColonLoc,
ArrayRef<CXXCtorInitializer*> MemInits,
bool AnyErrors) {
if (!ConstructorDecl)
return;
AdjustDeclIfTemplate(ConstructorDecl);
CXXConstructorDecl *Constructor
= dyn_cast<CXXConstructorDecl>(ConstructorDecl);
if (!Constructor) {
Diag(ColonLoc, diag::err_only_constructors_take_base_inits);
return;
}
// Mapping for the duplicate initializers check.
// For member initializers, this is keyed with a FieldDecl*.
// For base initializers, this is keyed with a Type*.
llvm::DenseMap<const void *, CXXCtorInitializer *> Members;
// Mapping for the inconsistent anonymous-union initializers check.
RedundantUnionMap MemberUnions;
bool HadError = false;
for (unsigned i = 0; i < MemInits.size(); i++) {
CXXCtorInitializer *Init = MemInits[i];
// Set the source order index.
Init->setSourceOrder(i);
if (Init->isAnyMemberInitializer()) {
const void *Key = GetKeyForMember(Context, Init);
if (CheckRedundantInit(*this, Init, Members[Key]) ||
CheckRedundantUnionInit(*this, Init, MemberUnions))
HadError = true;
} else if (Init->isBaseInitializer()) {
const void *Key = GetKeyForMember(Context, Init);
if (CheckRedundantInit(*this, Init, Members[Key]))
HadError = true;
} else {
assert(Init->isDelegatingInitializer());
// This must be the only initializer.
if (MemInits.size() != 1) {
Diag(Init->getSourceLocation(),
diag::err_delegating_initializer_alone)
<< Init->getSourceRange() << MemInits[i ? 0 : 1]->getSourceRange();
// We will treat this as being the only initializer.
}
SetDelegatingInitializer(Constructor, MemInits[i]);
// Return immediately as the initializer is set.
return;
}
}
if (HadError)
return;
DiagnoseBaseOrMemInitializerOrder(*this, Constructor, MemInits);
SetCtorInitializers(Constructor, AnyErrors, MemInits);
DiagnoseUninitializedFields(*this, Constructor);
}
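// Hedged example (not part of the original file): the delegating-initializer
// branch above rejects a constructor that mixes a delegating initializer with
// any other mem-initializer, roughly:
//
//   struct D {
//     int n;
//     D(int v) : n(v) {}
//     D() : D(0), n(1) {}  // error: an initializer for a delegating
//   };                     // constructor must appear alone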
void
Sema::MarkBaseAndMemberDestructorsReferenced(SourceLocation Location,
CXXRecordDecl *ClassDecl) {
// Ignore dependent contexts. Also ignore unions, since their members never
// have destructors implicitly called.
if (ClassDecl->isDependentContext() || ClassDecl->isUnion())
return;
// FIXME: all the access-control diagnostics are positioned on the
// field/base declaration. That's probably good; that said, the
// user might reasonably want to know why the destructor is being
// emitted, and we currently don't say.
// Non-static data members.
for (auto *Field : ClassDecl->fields()) {
if (Field->isInvalidDecl())
continue;
// Don't destroy incomplete or zero-length arrays.
if (isIncompleteOrZeroLengthArrayType(Context, Field->getType()))
continue;
QualType FieldType = Context.getBaseElementType(Field->getType());
const RecordType* RT = FieldType->getAs<RecordType>();
if (!RT)
continue;
CXXRecordDecl *FieldClassDecl = cast<CXXRecordDecl>(RT->getDecl());
if (FieldClassDecl->isInvalidDecl())
continue;
if (FieldClassDecl->hasIrrelevantDestructor())
continue;
// The destructor for an implicit anonymous union member is never invoked.
if (FieldClassDecl->isUnion() && FieldClassDecl->isAnonymousStructOrUnion())
continue;
CXXDestructorDecl *Dtor = LookupDestructor(FieldClassDecl);
assert(Dtor && "No dtor found for FieldClassDecl!");
CheckDestructorAccess(Field->getLocation(), Dtor,
PDiag(diag::err_access_dtor_field)
<< Field->getDeclName()
<< FieldType);
MarkFunctionReferenced(Location, Dtor);
DiagnoseUseOfDecl(Dtor, Location);
}
// We only potentially invoke the destructors of potentially constructed
// subobjects.
bool VisitVirtualBases = !ClassDecl->isAbstract();
llvm::SmallPtrSet<const RecordType *, 8> DirectVirtualBases;
// Bases.
for (const auto &Base : ClassDecl->bases()) {
// Bases are always records in a well-formed non-dependent class.
const RecordType *RT = Base.getType()->getAs<RecordType>();
// Remember direct virtual bases.
if (Base.isVirtual()) {
if (!VisitVirtualBases)
continue;
DirectVirtualBases.insert(RT);
}
CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(RT->getDecl());
// If our base class is invalid, we probably can't get its dtor anyway.
if (BaseClassDecl->isInvalidDecl())
continue;
if (BaseClassDecl->hasIrrelevantDestructor())
continue;
CXXDestructorDecl *Dtor = LookupDestructor(BaseClassDecl);
assert(Dtor && "No dtor found for BaseClassDecl!");
// FIXME: caret should be on the start of the class name
CheckDestructorAccess(Base.getBeginLoc(), Dtor,
PDiag(diag::err_access_dtor_base)
<< Base.getType() << Base.getSourceRange(),
Context.getTypeDeclType(ClassDecl));
MarkFunctionReferenced(Location, Dtor);
DiagnoseUseOfDecl(Dtor, Location);
}
if (!VisitVirtualBases)
return;
// Virtual bases.
for (const auto &VBase : ClassDecl->vbases()) {
// Bases are always records in a well-formed non-dependent class.
const RecordType *RT = VBase.getType()->castAs<RecordType>();
// Ignore direct virtual bases.
if (DirectVirtualBases.count(RT))
continue;
CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(RT->getDecl());
// If our base class is invalid, we probably can't get its dtor anyway.
if (BaseClassDecl->isInvalidDecl())
continue;
if (BaseClassDecl->hasIrrelevantDestructor())
continue;
CXXDestructorDecl *Dtor = LookupDestructor(BaseClassDecl);
assert(Dtor && "No dtor found for BaseClassDecl!");
if (CheckDestructorAccess(
ClassDecl->getLocation(), Dtor,
PDiag(diag::err_access_dtor_vbase)
<< Context.getTypeDeclType(ClassDecl) << VBase.getType(),
Context.getTypeDeclType(ClassDecl)) ==
AR_accessible) {
CheckDerivedToBaseConversion(
Context.getTypeDeclType(ClassDecl), VBase.getType(),
diag::err_access_dtor_vbase, 0, ClassDecl->getLocation(),
SourceRange(), DeclarationName(), nullptr);
}
MarkFunctionReferenced(Location, Dtor);
DiagnoseUseOfDecl(Dtor, Location);
}
}
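// Illustrative sketch (hypothetical user code): the destructor access checks
// above are what reject, for example, a user-provided constructor of a class
// holding a member whose destructor is inaccessible:
//
//   class Locked { ~Locked(); };
//   struct Holder {
//     Locked l;
//     Holder() {}  // error: field of type 'Locked' has private destructor
//   };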
void Sema::ActOnDefaultCtorInitializers(Decl *CDtorDecl) {
if (!CDtorDecl)
return;
if (CXXConstructorDecl *Constructor
= dyn_cast<CXXConstructorDecl>(CDtorDecl)) {
SetCtorInitializers(Constructor, /*AnyErrors=*/false);
DiagnoseUninitializedFields(*this, Constructor);
}
}
bool Sema::isAbstractType(SourceLocation Loc, QualType T) {
if (!getLangOpts().CPlusPlus)
return false;
const auto *RD = Context.getBaseElementType(T)->getAsCXXRecordDecl();
if (!RD)
return false;
// FIXME: Per [temp.inst]p1, we are supposed to trigger instantiation of a
// class template specialization here, but doing so breaks a lot of code.
// We can't answer whether something is abstract until it has a
// definition. If it's currently being defined, we'll walk back
// over all the declarations when we have a full definition.
const CXXRecordDecl *Def = RD->getDefinition();
if (!Def || Def->isBeingDefined())
return false;
return RD->isAbstract();
}
bool Sema::RequireNonAbstractType(SourceLocation Loc, QualType T,
TypeDiagnoser &Diagnoser) {
if (!isAbstractType(Loc, T))
return false;
T = Context.getBaseElementType(T);
Diagnoser.diagnose(*this, Loc, T);
DiagnoseAbstractType(T->getAsCXXRecordDecl());
return true;
}
void Sema::DiagnoseAbstractType(const CXXRecordDecl *RD) {
// Check if we've already emitted the list of pure virtual functions
// for this class.
if (PureVirtualClassDiagSet && PureVirtualClassDiagSet->count(RD))
return;
// If the diagnostic is suppressed, don't emit the notes. We're only
// going to emit them once, so try to attach them to a diagnostic we're
// actually going to show.
if (Diags.isLastDiagnosticIgnored())
return;
CXXFinalOverriderMap FinalOverriders;
RD->getFinalOverriders(FinalOverriders);
// Keep a set of seen pure methods so we won't diagnose the same method
// more than once.
llvm::SmallPtrSet<const CXXMethodDecl *, 8> SeenPureMethods;
for (CXXFinalOverriderMap::iterator M = FinalOverriders.begin(),
MEnd = FinalOverriders.end();
M != MEnd;
++M) {
for (OverridingMethods::iterator SO = M->second.begin(),
SOEnd = M->second.end();
SO != SOEnd; ++SO) {
// C++ [class.abstract]p4:
// A class is abstract if it contains or inherits at least one
// pure virtual function for which the final overrider is pure
// virtual.
//
if (SO->second.size() != 1)
continue;
if (!SO->second.front().Method->isPure())
continue;
if (!SeenPureMethods.insert(SO->second.front().Method).second)
continue;
Diag(SO->second.front().Method->getLocation(),
diag::note_pure_virtual_function)
<< SO->second.front().Method->getDeclName() << RD->getDeclName();
}
}
if (!PureVirtualClassDiagSet)
PureVirtualClassDiagSet.reset(new RecordDeclSetTy);
PureVirtualClassDiagSet->insert(RD);
}
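// Hedged example (hypothetical user code): DiagnoseAbstractType attaches a
// note for each pure virtual function whose final overrider is still pure,
// e.g.:
//
//   struct Shape { virtual void draw() = 0; };
//   struct Named : Shape {};  // still abstract
//   Named n;                  // error: 'Named' is an abstract class
//                             // note: unimplemented pure virtual method
//                             //       'draw' in 'Named'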
namespace {
struct AbstractUsageInfo {
Sema &S;
CXXRecordDecl *Record;
CanQualType AbstractType;
bool Invalid;
AbstractUsageInfo(Sema &S, CXXRecordDecl *Record)
: S(S), Record(Record),
AbstractType(S.Context.getCanonicalType(
S.Context.getTypeDeclType(Record))),
Invalid(false) {}
void DiagnoseAbstractType() {
if (Invalid) return;
S.DiagnoseAbstractType(Record);
Invalid = true;
}
void CheckType(const NamedDecl *D, TypeLoc TL, Sema::AbstractDiagSelID Sel);
};
struct CheckAbstractUsage {
AbstractUsageInfo &Info;
const NamedDecl *Ctx;
CheckAbstractUsage(AbstractUsageInfo &Info, const NamedDecl *Ctx)
: Info(Info), Ctx(Ctx) {}
void Visit(TypeLoc TL, Sema::AbstractDiagSelID Sel) {
switch (TL.getTypeLocClass()) {
#define ABSTRACT_TYPELOC(CLASS, PARENT)
#define TYPELOC(CLASS, PARENT) \
case TypeLoc::CLASS: Check(TL.castAs<CLASS##TypeLoc>(), Sel); break;
#include "clang/AST/TypeLocNodes.def"
}
}
void Check(FunctionProtoTypeLoc TL, Sema::AbstractDiagSelID Sel) {
Visit(TL.getReturnLoc(), Sema::AbstractReturnType);
for (unsigned I = 0, E = TL.getNumParams(); I != E; ++I) {
if (!TL.getParam(I))
continue;
TypeSourceInfo *TSI = TL.getParam(I)->getTypeSourceInfo();
if (TSI) Visit(TSI->getTypeLoc(), Sema::AbstractParamType);
}
}
void Check(ArrayTypeLoc TL, Sema::AbstractDiagSelID Sel) {
Visit(TL.getElementLoc(), Sema::AbstractArrayType);
}
void Check(TemplateSpecializationTypeLoc TL, Sema::AbstractDiagSelID Sel) {
// Visit the type parameters from a permissive context.
for (unsigned I = 0, E = TL.getNumArgs(); I != E; ++I) {
TemplateArgumentLoc TAL = TL.getArgLoc(I);
if (TAL.getArgument().getKind() == TemplateArgument::Type)
if (TypeSourceInfo *TSI = TAL.getTypeSourceInfo())
Visit(TSI->getTypeLoc(), Sema::AbstractNone);
// TODO: other template argument types?
}
}
// Visit pointee types from a permissive context.
#define CheckPolymorphic(Type) \
void Check(Type TL, Sema::AbstractDiagSelID Sel) { \
Visit(TL.getNextTypeLoc(), Sema::AbstractNone); \
}
CheckPolymorphic(PointerTypeLoc)
CheckPolymorphic(ReferenceTypeLoc)
CheckPolymorphic(MemberPointerTypeLoc)
CheckPolymorphic(BlockPointerTypeLoc)
CheckPolymorphic(AtomicTypeLoc)
/// Handle all the types we haven't given a more specific
/// implementation for above.
void Check(TypeLoc TL, Sema::AbstractDiagSelID Sel) {
// Every other kind of type that we haven't called out already
// that has an inner type is either (1) sugar or (2) contains that
// inner type in some way as a subobject.
if (TypeLoc Next = TL.getNextTypeLoc())
return Visit(Next, Sel);
// If there's no inner type and we're in a permissive context,
// don't diagnose.
if (Sel == Sema::AbstractNone) return;
// Check whether the type matches the abstract type.
QualType T = TL.getType();
if (T->isArrayType()) {
Sel = Sema::AbstractArrayType;
T = Info.S.Context.getBaseElementType(T);
}
CanQualType CT = T->getCanonicalTypeUnqualified().getUnqualifiedType();
if (CT != Info.AbstractType) return;
// It matched; do some magic.
if (Sel == Sema::AbstractArrayType) {
Info.S.Diag(Ctx->getLocation(), diag::err_array_of_abstract_type)
<< T << TL.getSourceRange();
} else {
Info.S.Diag(Ctx->getLocation(), diag::err_abstract_type_in_decl)
<< Sel << T << TL.getSourceRange();
}
Info.DiagnoseAbstractType();
}
};
void AbstractUsageInfo::CheckType(const NamedDecl *D, TypeLoc TL,
Sema::AbstractDiagSelID Sel) {
CheckAbstractUsage(*this, D).Visit(TL, Sel);
}
}
/// Check for invalid uses of an abstract type in a method declaration.
static void CheckAbstractClassUsage(AbstractUsageInfo &Info,
CXXMethodDecl *MD) {
// No need to do the check on definitions, which require that
// the return/param types be complete.
if (MD->doesThisDeclarationHaveABody())
return;
// For safety's sake, just ignore it if we don't have type source
// information. This should never happen for non-implicit methods,
// but...
if (TypeSourceInfo *TSI = MD->getTypeSourceInfo())
Info.CheckType(MD, TSI->getTypeLoc(), Sema::AbstractNone);
}
/// Check for invalid uses of an abstract type within a class definition.
static void CheckAbstractClassUsage(AbstractUsageInfo &Info,
CXXRecordDecl *RD) {
for (auto *D : RD->decls()) {
if (D->isImplicit()) continue;
// Methods and method templates.
if (isa<CXXMethodDecl>(D)) {
CheckAbstractClassUsage(Info, cast<CXXMethodDecl>(D));
} else if (isa<FunctionTemplateDecl>(D)) {
FunctionDecl *FD = cast<FunctionTemplateDecl>(D)->getTemplatedDecl();
CheckAbstractClassUsage(Info, cast<CXXMethodDecl>(FD));
// Fields and static variables.
} else if (isa<FieldDecl>(D)) {
FieldDecl *FD = cast<FieldDecl>(D);
if (TypeSourceInfo *TSI = FD->getTypeSourceInfo())
Info.CheckType(FD, TSI->getTypeLoc(), Sema::AbstractFieldType);
} else if (isa<VarDecl>(D)) {
VarDecl *VD = cast<VarDecl>(D);
if (TypeSourceInfo *TSI = VD->getTypeSourceInfo())
Info.CheckType(VD, TSI->getTypeLoc(), Sema::AbstractVariableType);
// Nested classes and class templates.
} else if (isa<CXXRecordDecl>(D)) {
CheckAbstractClassUsage(Info, cast<CXXRecordDecl>(D));
} else if (isa<ClassTemplateDecl>(D)) {
CheckAbstractClassUsage(Info,
cast<ClassTemplateDecl>(D)->getTemplatedDecl());
}
}
}
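// Illustrative sketch (hypothetical user code): these checks run once the
// class is complete and diagnose declarations inside the class that use its
// own, abstract, type by value:
//
//   struct Abstract {
//     virtual void f() = 0;
//     Abstract clone();       // error: return type is an abstract class
//     void take(Abstract a);  // error: parameter type is an abstract class
//   };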
static void ReferenceDllExportedMembers(Sema &S, CXXRecordDecl *Class) {
Attr *ClassAttr = getDLLAttr(Class);
if (!ClassAttr)
return;
assert(ClassAttr->getKind() == attr::DLLExport);
TemplateSpecializationKind TSK = Class->getTemplateSpecializationKind();
if (TSK == TSK_ExplicitInstantiationDeclaration)
// Don't go any further if this is just an explicit instantiation
// declaration.
return;
if (S.Context.getTargetInfo().getTriple().isWindowsGNUEnvironment())
S.MarkVTableUsed(Class->getLocation(), Class, true);
for (Decl *Member : Class->decls()) {
// Defined static variables that are members of an exported base
// class must be marked as exported too.
auto *VD = dyn_cast<VarDecl>(Member);
if (VD && Member->getAttr<DLLExportAttr>() &&
VD->getStorageClass() == SC_Static &&
TSK == TSK_ImplicitInstantiation)
S.MarkVariableReferenced(VD->getLocation(), VD);
auto *MD = dyn_cast<CXXMethodDecl>(Member);
if (!MD)
continue;
if (Member->getAttr<DLLExportAttr>()) {
if (MD->isUserProvided()) {
// Instantiate non-default class member functions ...
// .. except for certain kinds of template specializations.
if (TSK == TSK_ImplicitInstantiation && !ClassAttr->isInherited())
continue;
S.MarkFunctionReferenced(Class->getLocation(), MD);
// The function will be passed to the consumer when its definition is
// encountered.
} else if (!MD->isTrivial() || MD->isExplicitlyDefaulted() ||
MD->isCopyAssignmentOperator() ||
MD->isMoveAssignmentOperator()) {
// Synthesize and instantiate non-trivial implicit methods, explicitly
// defaulted methods, and the copy and move assignment operators. The
// latter are exported even if they are trivial, because the address of
// an operator can be taken and should compare equal across libraries.
DiagnosticErrorTrap Trap(S.Diags);
S.MarkFunctionReferenced(Class->getLocation(), MD);
if (Trap.hasErrorOccurred()) {
S.Diag(ClassAttr->getLocation(), diag::note_due_to_dllexported_class)
<< Class << !S.getLangOpts().CPlusPlus11;
break;
}
// There is no later point when we will see the definition of this
// function, so pass it to the consumer now.
S.Consumer.HandleTopLevelDecl(DeclGroupRef(MD));
}
}
}
}
static void checkForMultipleExportedDefaultConstructors(Sema &S,
CXXRecordDecl *Class) {
// Only the MS ABI has default constructor closures, so we don't need to do
// this semantic checking anywhere else.
if (!S.Context.getTargetInfo().getCXXABI().isMicrosoft())
return;
CXXConstructorDecl *LastExportedDefaultCtor = nullptr;
for (Decl *Member : Class->decls()) {
// Look for exported default constructors.
auto *CD = dyn_cast<CXXConstructorDecl>(Member);
if (!CD || !CD->isDefaultConstructor())
continue;
auto *Attr = CD->getAttr<DLLExportAttr>();
if (!Attr)
continue;
// If the class is non-dependent, mark the default arguments as ODR-used so
// that we can properly codegen the constructor closure.
if (!Class->isDependentContext()) {
for (ParmVarDecl *PD : CD->parameters()) {
(void)S.CheckCXXDefaultArgExpr(Attr->getLocation(), CD, PD);
S.DiscardCleanupsInEvaluationContext();
}
}
if (LastExportedDefaultCtor) {
S.Diag(LastExportedDefaultCtor->getLocation(),
diag::err_attribute_dll_ambiguous_default_ctor)
<< Class;
S.Diag(CD->getLocation(), diag::note_entity_declared_at)
<< CD->getDeclName();
return;
}
LastExportedDefaultCtor = CD;
}
}
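// Hedged example (hypothetical user code, MS ABI only): the check above
// rejects exporting more than one constructor that is usable as a default
// constructor, since only one default-constructor closure can be emitted:
//
//   struct __declspec(dllexport) S {
//     S(int = 0);
//     S(void *p = nullptr);  // error: dllexport cannot be applied to more
//   };                       // than one default constructor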
void Sema::checkClassLevelCodeSegAttribute(CXXRecordDecl *Class) {
// Mark any compiler-generated routines with the implicit code_seg attribute.
for (auto *Method : Class->methods()) {
if (Method->isUserProvided())
continue;
if (Attr *A = getImplicitCodeSegOrSectionAttrForFunction(Method, /*IsDefinition=*/true))
Method->addAttr(A);
}
}
/// Check class-level dllimport/dllexport attribute.
void Sema::checkClassLevelDLLAttribute(CXXRecordDecl *Class) {
Attr *ClassAttr = getDLLAttr(Class);
// MSVC propagates DLL attributes to partial class template specializations.
if (Context.getTargetInfo().getCXXABI().isMicrosoft() && !ClassAttr) {
if (auto *Spec = dyn_cast<ClassTemplatePartialSpecializationDecl>(Class)) {
if (Attr *TemplateAttr =
getDLLAttr(Spec->getSpecializedTemplate()->getTemplatedDecl())) {
auto *A = cast<InheritableAttr>(TemplateAttr->clone(getASTContext()));
A->setInherited(true);
ClassAttr = A;
}
}
}
if (!ClassAttr)
return;
if (!Class->isExternallyVisible()) {
Diag(Class->getLocation(), diag::err_attribute_dll_not_extern)
<< Class << ClassAttr;
return;
}
if (Context.getTargetInfo().getCXXABI().isMicrosoft() &&
!ClassAttr->isInherited()) {
// Diagnose dll attributes on members of class with dll attribute.
for (Decl *Member : Class->decls()) {
if (!isa<VarDecl>(Member) && !isa<CXXMethodDecl>(Member))
continue;
InheritableAttr *MemberAttr = getDLLAttr(Member);
if (!MemberAttr || MemberAttr->isInherited() || Member->isInvalidDecl())
continue;
Diag(MemberAttr->getLocation(),
diag::err_attribute_dll_member_of_dll_class)
<< MemberAttr << ClassAttr;
Diag(ClassAttr->getLocation(), diag::note_previous_attribute);
Member->setInvalidDecl();
}
}
if (Class->getDescribedClassTemplate())
// Don't inherit dll attribute until the template is instantiated.
return;
// The class is either imported or exported.
const bool ClassExported = ClassAttr->getKind() == attr::DLLExport;
// Check if this was a dllimport attribute propagated from a derived class to
// a base class template specialization. We don't apply these attributes to
// static data members.
const bool PropagatedImport =
!ClassExported &&
cast<DLLImportAttr>(ClassAttr)->wasPropagatedToBaseTemplate();
TemplateSpecializationKind TSK = Class->getTemplateSpecializationKind();
// Ignore explicit dllexport on explicit class template instantiation
// declarations, except in MinGW mode.
if (ClassExported && !ClassAttr->isInherited() &&
TSK == TSK_ExplicitInstantiationDeclaration &&
!Context.getTargetInfo().getTriple().isWindowsGNUEnvironment()) {
Class->dropAttr<DLLExportAttr>();
return;
}
// Force declaration of implicit members so they can inherit the attribute.
ForceDeclarationOfImplicitMembers(Class);
// FIXME: MSVC's docs say all bases must be exportable, but this doesn't
// seem to be true in practice?
for (Decl *Member : Class->decls()) {
VarDecl *VD = dyn_cast<VarDecl>(Member);
CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(Member);
// Only methods and static fields inherit the attributes.
if (!VD && !MD)
continue;
if (MD) {
// Don't process deleted methods.
if (MD->isDeleted())
continue;
if (MD->isInlined()) {
// MinGW does not import or export inline methods, but we still do it for
// explicit template instantiations.
if (!Context.getTargetInfo().getCXXABI().isMicrosoft() &&
!Context.getTargetInfo().getTriple().isWindowsItaniumEnvironment() &&
TSK != TSK_ExplicitInstantiationDeclaration &&
TSK != TSK_ExplicitInstantiationDefinition)
continue;
// MSVC versions before 2015 don't export the move assignment operators
// and move constructor, so don't attempt to import/export them if
// we have a definition.
auto *Ctor = dyn_cast<CXXConstructorDecl>(MD);
if ((MD->isMoveAssignmentOperator() ||
(Ctor && Ctor->isMoveConstructor())) &&
!getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015))
continue;
// MSVC 2015 doesn't export trivial defaulted constructors/destructors, but
// the copy assignment operator is exported anyway.
if (getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015) &&
(Ctor || isa<CXXDestructorDecl>(MD)) && MD->isTrivial())
continue;
}
}
// Don't apply dllimport attributes to static data members of class template
// instantiations when the attribute is propagated from a derived class.
if (VD && PropagatedImport)
continue;
if (!cast<NamedDecl>(Member)->isExternallyVisible())
continue;
if (!getDLLAttr(Member)) {
InheritableAttr *NewAttr = nullptr;
// Do not export/import inline functions when -fno-dllexport-inlines is
// passed, but add the attribute for the later local static variable check.
if (!getLangOpts().DllExportInlines && MD && MD->isInlined() &&
TSK != TSK_ExplicitInstantiationDeclaration &&
TSK != TSK_ExplicitInstantiationDefinition) {
if (ClassExported) {
NewAttr = ::new (getASTContext())
DLLExportStaticLocalAttr(ClassAttr->getRange(),
getASTContext(),
ClassAttr->getSpellingListIndex());
} else {
NewAttr = ::new (getASTContext())
DLLImportStaticLocalAttr(ClassAttr->getRange(),
getASTContext(),
ClassAttr->getSpellingListIndex());
}
} else {
NewAttr = cast<InheritableAttr>(ClassAttr->clone(getASTContext()));
}
NewAttr->setInherited(true);
Member->addAttr(NewAttr);
if (MD) {
// Propagate DLLAttr to friend re-declarations of MD that have already
// been constructed.
for (FunctionDecl *FD = MD->getMostRecentDecl(); FD;
FD = FD->getPreviousDecl()) {
if (FD->getFriendObjectKind() == Decl::FOK_None)
continue;
assert(!getDLLAttr(FD) &&
"friend re-decl should not already have a DLLAttr");
NewAttr = cast<InheritableAttr>(ClassAttr->clone(getASTContext()));
NewAttr->setInherited(true);
FD->addAttr(NewAttr);
}
}
}
}
if (ClassExported)
DelayedDllExportClasses.push_back(Class);
}
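// Illustrative sketch (hypothetical user code): the class-level attribute
// handled above is pushed down to the externally visible members, so
//
//   class __declspec(dllexport) Widget {
//   public:
//     void draw();       // inherits dllexport
//     static int count;  // inherits dllexport
//   };
//
// exports both Widget::draw and Widget::count without per-member annotations.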
/// Perform propagation of DLL attributes from a derived class to a
/// templated base class for MS compatibility.
void Sema::propagateDLLAttrToBaseClassTemplate(
CXXRecordDecl *Class, Attr *ClassAttr,
ClassTemplateSpecializationDecl *BaseTemplateSpec, SourceLocation BaseLoc) {
if (getDLLAttr(
BaseTemplateSpec->getSpecializedTemplate()->getTemplatedDecl())) {
// If the base class template has a DLL attribute, don't try to change it.
return;
}
auto TSK = BaseTemplateSpec->getSpecializationKind();
if (!getDLLAttr(BaseTemplateSpec) &&
(TSK == TSK_Undeclared || TSK == TSK_ExplicitInstantiationDeclaration ||
TSK == TSK_ImplicitInstantiation)) {
// The template hasn't been instantiated yet (or it has, but only as an
// explicit instantiation declaration or implicit instantiation, which means
// we haven't codegenned any members yet), so propagate the attribute.
auto *NewAttr = cast<InheritableAttr>(ClassAttr->clone(getASTContext()));
NewAttr->setInherited(true);
BaseTemplateSpec->addAttr(NewAttr);
// If this was an import, mark that we propagated it from a derived class to
// a base class template specialization.
if (auto *ImportAttr = dyn_cast<DLLImportAttr>(NewAttr))
ImportAttr->setPropagatedToBaseTemplate();
// If the template is already instantiated, checkDLLAttributeRedeclaration()
// needs to be run again to see the new attribute. Otherwise this will
// get run whenever the template is instantiated.
if (TSK != TSK_Undeclared)
checkClassLevelDLLAttribute(BaseTemplateSpec);
return;
}
if (getDLLAttr(BaseTemplateSpec)) {
// The template has already been specialized or instantiated with an
// attribute, explicitly or through propagation. We should not try to change
// it.
return;
}
// The template was previously instantiated or explicitly specialized without
// a dll attribute. It's too late for us to add an attribute, so warn that
// this is unsupported.
Diag(BaseLoc, diag::warn_attribute_dll_instantiated_base_class)
<< BaseTemplateSpec->isExplicitSpecialization();
Diag(ClassAttr->getLocation(), diag::note_attribute);
if (BaseTemplateSpec->isExplicitSpecialization()) {
Diag(BaseTemplateSpec->getLocation(),
diag::note_template_class_explicit_specialization_was_here)
<< BaseTemplateSpec;
} else {
Diag(BaseTemplateSpec->getPointOfInstantiation(),
diag::note_template_class_instantiation_was_here)
<< BaseTemplateSpec;
}
}
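// Hedged example (hypothetical user code): for MS compatibility the derived
// class's attribute is propagated to a base class template specialization
// that has not been codegenned yet, e.g.:
//
//   template <typename T> struct Base { void f() {} };
//   struct __declspec(dllexport) Derived : Base<int> {};
//   // Base<int> is treated as dllexport as well.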
static void DefineImplicitSpecialMember(Sema &S, CXXMethodDecl *MD,
SourceLocation DefaultLoc) {
switch (S.getSpecialMember(MD)) {
case Sema::CXXDefaultConstructor:
S.DefineImplicitDefaultConstructor(DefaultLoc,
cast<CXXConstructorDecl>(MD));
break;
case Sema::CXXCopyConstructor:
S.DefineImplicitCopyConstructor(DefaultLoc, cast<CXXConstructorDecl>(MD));
break;
case Sema::CXXCopyAssignment:
S.DefineImplicitCopyAssignment(DefaultLoc, MD);
break;
case Sema::CXXDestructor:
S.DefineImplicitDestructor(DefaultLoc, cast<CXXDestructorDecl>(MD));
break;
case Sema::CXXMoveConstructor:
S.DefineImplicitMoveConstructor(DefaultLoc, cast<CXXConstructorDecl>(MD));
break;
case Sema::CXXMoveAssignment:
S.DefineImplicitMoveAssignment(DefaultLoc, MD);
break;
case Sema::CXXInvalid:
llvm_unreachable("Invalid special member.");
}
}
/// Determine whether a type is permitted to be passed or returned in
/// registers, per C++ [class.temporary]p3.
static bool canPassInRegisters(Sema &S, CXXRecordDecl *D,
TargetInfo::CallingConvKind CCK) {
if (D->isDependentType() || D->isInvalidDecl())
return false;
// Clang <= 4 used the pre-C++11 rule, which ignores move operations.
// The PS4 platform ABI follows the behavior of Clang 3.2.
if (CCK == TargetInfo::CCK_ClangABI4OrPS4)
return !D->hasNonTrivialDestructorForCall() &&
!D->hasNonTrivialCopyConstructorForCall();
if (CCK == TargetInfo::CCK_MicrosoftWin64) {
bool CopyCtorIsTrivial = false, CopyCtorIsTrivialForCall = false;
bool DtorIsTrivialForCall = false;
// If a class has at least one non-deleted, trivial copy constructor, it
// is passed according to the C ABI. Otherwise, it is passed indirectly.
//
// Note: This permits classes with non-trivial copy or move ctors to be
// passed in registers, so long as they *also* have a trivial copy ctor,
// which is non-conforming.
if (D->needsImplicitCopyConstructor()) {
if (!D->defaultedCopyConstructorIsDeleted()) {
if (D->hasTrivialCopyConstructor())
CopyCtorIsTrivial = true;
if (D->hasTrivialCopyConstructorForCall())
CopyCtorIsTrivialForCall = true;
}
} else {
for (const CXXConstructorDecl *CD : D->ctors()) {
if (CD->isCopyConstructor() && !CD->isDeleted()) {
if (CD->isTrivial())
CopyCtorIsTrivial = true;
if (CD->isTrivialForCall())
CopyCtorIsTrivialForCall = true;
}
}
}
if (D->needsImplicitDestructor()) {
if (!D->defaultedDestructorIsDeleted() &&
D->hasTrivialDestructorForCall())
DtorIsTrivialForCall = true;
} else if (const auto *DD = D->getDestructor()) {
if (!DD->isDeleted() && DD->isTrivialForCall())
DtorIsTrivialForCall = true;
}
// If the copy ctor and dtor are both trivial-for-calls, pass direct.
if (CopyCtorIsTrivialForCall && DtorIsTrivialForCall)
return true;
// If a class has a destructor, we'd really like to pass it indirectly
// because it allows us to elide copies. Unfortunately, MSVC makes that
// impossible for small types, which it will pass in a single register or
// stack slot. Most objects with dtors are large-ish, so handle that early.
// We can't call out all large objects as being indirect because there are
// multiple x64 calling conventions and the C++ ABI code shouldn't dictate
// how we pass large POD types.
// Note: This permits small classes with nontrivial destructors to be
// passed in registers, which is non-conforming.
bool isAArch64 = S.Context.getTargetInfo().getTriple().isAArch64();
uint64_t TypeSize = isAArch64 ? 128 : 64;
if (CopyCtorIsTrivial &&
S.getASTContext().getTypeSize(D->getTypeForDecl()) <= TypeSize)
return true;
return false;
}
// Per C++ [class.temporary]p3, the relevant condition is:
// each copy constructor, move constructor, and destructor of X is
// either trivial or deleted, and X has at least one non-deleted copy
// or move constructor
bool HasNonDeletedCopyOrMove = false;
if (D->needsImplicitCopyConstructor() &&
!D->defaultedCopyConstructorIsDeleted()) {
if (!D->hasTrivialCopyConstructorForCall())
return false;
HasNonDeletedCopyOrMove = true;
}
if (S.getLangOpts().CPlusPlus11 && D->needsImplicitMoveConstructor() &&
!D->defaultedMoveConstructorIsDeleted()) {
if (!D->hasTrivialMoveConstructorForCall())
return false;
HasNonDeletedCopyOrMove = true;
}
if (D->needsImplicitDestructor() && !D->defaultedDestructorIsDeleted() &&
!D->hasTrivialDestructorForCall())
return false;
for (const CXXMethodDecl *MD : D->methods()) {
if (MD->isDeleted())
continue;
auto *CD = dyn_cast<CXXConstructorDecl>(MD);
if (CD && CD->isCopyOrMoveConstructor())
HasNonDeletedCopyOrMove = true;
else if (!isa<CXXDestructorDecl>(MD))
continue;
if (!MD->isTrivialForCall())
return false;
}
return HasNonDeletedCopyOrMove;
}
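// Illustrative sketch (hypothetical user code): under the generic
// [class.temporary]p3 rule applied at the end of canPassInRegisters, a type
// with a non-trivial destructor is passed indirectly, while a trivially
// copyable one can be passed in registers:
//
//   struct InRegs { int a, b; };                 // trivial: in registers
//   struct Indirect { ~Indirect(); int a, b; };  // non-trivial destructor:
//                                                // passed indirectly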
/// Perform semantic checks on a class definition that has been
/// completed, introducing implicitly-declared members, checking for
/// abstract types, etc.
void Sema::CheckCompletedCXXClass(CXXRecordDecl *Record) {
if (!Record)
return;
if (Record->isAbstract() && !Record->isInvalidDecl()) {
AbstractUsageInfo Info(*this, Record);
CheckAbstractClassUsage(Info, Record);
}
// If this is not an aggregate type and has no user-declared constructor,
// complain about any non-static data members of reference or const scalar
// type, since they will never get initializers.
if (!Record->isInvalidDecl() && !Record->isDependentType() &&
!Record->isAggregate() && !Record->hasUserDeclaredConstructor() &&
!Record->isLambda()) {
bool Complained = false;
for (const auto *F : Record->fields()) {
if (F->hasInClassInitializer() || F->isUnnamedBitfield())
continue;
if (F->getType()->isReferenceType() ||
(F->getType().isConstQualified() && F->getType()->isScalarType())) {
if (!Complained) {
Diag(Record->getLocation(), diag::warn_no_constructor_for_refconst)
<< Record->getTagKind() << Record;
Complained = true;
}
Diag(F->getLocation(), diag::note_refconst_member_not_initialized)
<< F->getType()->isReferenceType()
<< F->getDeclName();
}
}
}
if (Record->getIdentifier()) {
// C++ [class.mem]p13:
// If T is the name of a class, then each of the following shall have a
// name different from T:
// - every member of every anonymous union that is a member of class T.
//
// C++ [class.mem]p14:
// In addition, if class T has a user-declared constructor (12.1), every
// non-static data member of class T shall have a name different from T.
DeclContext::lookup_result R = Record->lookup(Record->getDeclName());
for (DeclContext::lookup_iterator I = R.begin(), E = R.end(); I != E;
++I) {
NamedDecl *D = (*I)->getUnderlyingDecl();
if (((isa<FieldDecl>(D) || isa<UnresolvedUsingValueDecl>(D)) &&
Record->hasUserDeclaredConstructor()) ||
isa<IndirectFieldDecl>(D)) {
Diag((*I)->getLocation(), diag::err_member_name_of_class)
<< D->getDeclName();
break;
}
}
}
// Warn if the class has virtual methods but a non-virtual public destructor.
if (Record->isPolymorphic() && !Record->isDependentType()) {
CXXDestructorDecl *dtor = Record->getDestructor();
if ((!dtor || (!dtor->isVirtual() && dtor->getAccess() == AS_public)) &&
!Record->hasAttr<FinalAttr>())
Diag(dtor ? dtor->getLocation() : Record->getLocation(),
diag::warn_non_virtual_dtor) << Context.getRecordType(Record);
}
if (Record->isAbstract()) {
if (FinalAttr *FA = Record->getAttr<FinalAttr>()) {
Diag(Record->getLocation(), diag::warn_abstract_final_class)
<< FA->isSpelledAsSealed();
DiagnoseAbstractType(Record);
}
}
// See if trivial_abi has to be dropped.
if (Record->hasAttr<TrivialABIAttr>())
checkIllFormedTrivialABIStruct(*Record);
// Set HasTrivialSpecialMemberForCall if the record has attribute
// "trivial_abi".
bool HasTrivialABI = Record->hasAttr<TrivialABIAttr>();
if (HasTrivialABI)
Record->setHasTrivialSpecialMemberForCall();
auto CompleteMemberFunction = [&](CXXMethodDecl *M) {
// Check whether the explicitly-defaulted special members are valid.
if (!M->isInvalidDecl() && M->isExplicitlyDefaulted())
CheckExplicitlyDefaultedSpecialMember(M);
// For an explicitly defaulted or deleted special member, we defer
// determining triviality until the class is complete. That time is now!
CXXSpecialMember CSM = getSpecialMember(M);
if (!M->isImplicit() && !M->isUserProvided()) {
if (CSM != CXXInvalid) {
M->setTrivial(SpecialMemberIsTrivial(M, CSM));
// Inform the class that we've finished declaring this member.
Record->finishedDefaultedOrDeletedMember(M);
M->setTrivialForCall(
HasTrivialABI ||
SpecialMemberIsTrivial(M, CSM, TAH_ConsiderTrivialABI));
Record->setTrivialForCallFlags(M);
}
}
// Set triviality for the purpose of calls if this is a user-provided
// copy/move constructor or destructor.
if ((CSM == CXXCopyConstructor || CSM == CXXMoveConstructor ||
CSM == CXXDestructor) && M->isUserProvided()) {
M->setTrivialForCall(HasTrivialABI);
Record->setTrivialForCallFlags(M);
}
if (!M->isInvalidDecl() && M->isExplicitlyDefaulted() &&
M->hasAttr<DLLExportAttr>()) {
if (getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015) &&
M->isTrivial() &&
(CSM == CXXDefaultConstructor || CSM == CXXCopyConstructor ||
CSM == CXXDestructor))
M->dropAttr<DLLExportAttr>();
if (M->hasAttr<DLLExportAttr>()) {
// Define after any fields with in-class initializers have been parsed.
DelayedDllExportMemberFunctions.push_back(M);
}
}
};
bool HasMethodWithOverrideControl = false,
HasOverridingMethodWithoutOverrideControl = false;
if (!Record->isDependentType()) {
// Check the destructor before any other member function. We need to
// determine whether it's trivial in order to determine whether the class
// type is a literal type, which is a prerequisite for determining whether
// other special member functions are valid and whether they're implicitly
// 'constexpr'.
if (CXXDestructorDecl *Dtor = Record->getDestructor())
CompleteMemberFunction(Dtor);
for (auto *M : Record->methods()) {
// See if a method overloads virtual methods in a base
// class without overriding any.
if (!M->isStatic())
DiagnoseHiddenVirtualMethods(M);
if (M->hasAttr<OverrideAttr>())
HasMethodWithOverrideControl = true;
else if (M->size_overridden_methods() > 0)
HasOverridingMethodWithoutOverrideControl = true;
if (!isa<CXXDestructorDecl>(M))
CompleteMemberFunction(M);
}
}
if (HasMethodWithOverrideControl &&
HasOverridingMethodWithoutOverrideControl) {
// At least one method has the 'override' control declared.
// Diagnose all other overridden methods which do not have 'override'
// specified on them.
for (auto *M : Record->methods())
DiagnoseAbsenceOfOverrideControl(M);
}
// ms_struct is a request to use the same ABI rules as MSVC. Check
// whether this class uses any C++ features that are implemented
// completely differently in MSVC, and if so, emit a diagnostic.
// That diagnostic defaults to an error, but we allow projects to
// map it down to a warning (or ignore it). It's a fairly common
// practice among users of the ms_struct pragma to mass-annotate
// headers, sweeping up a bunch of types that the project doesn't
// really rely on MSVC-compatible layout for. We must therefore
// support "ms_struct except for C++ stuff" as a secondary ABI.
if (Record->isMsStruct(Context) &&
(Record->isPolymorphic() || Record->getNumBases())) {
Diag(Record->getLocation(), diag::warn_cxx_ms_struct);
}
checkClassLevelDLLAttribute(Record);
checkClassLevelCodeSegAttribute(Record);
bool ClangABICompat4 =
Context.getLangOpts().getClangABICompat() <= LangOptions::ClangABI::Ver4;
TargetInfo::CallingConvKind CCK =
Context.getTargetInfo().getCallingConvKind(ClangABICompat4);
bool CanPass = canPassInRegisters(*this, Record, CCK);
// Do not change ArgPassingRestrictions if it has already been set to
// APK_CanNeverPassInRegs.
if (Record->getArgPassingRestrictions() != RecordDecl::APK_CanNeverPassInRegs)
Record->setArgPassingRestrictions(CanPass
? RecordDecl::APK_CanPassInRegs
: RecordDecl::APK_CannotPassInRegs);
// If canPassInRegisters returns true despite the record having a non-trivial
// destructor, the record is destructed in the callee. This happens only when
// the record or one of its subobjects has a field annotated with trivial_abi
// or a field qualified with ObjC __strong/__weak.
if (Context.getTargetInfo().getCXXABI().areArgsDestroyedLeftToRightInCallee())
Record->setParamDestroyedInCallee(true);
else if (Record->hasNonTrivialDestructor())
Record->setParamDestroyedInCallee(CanPass);
if (getLangOpts().ForceEmitVTables) {
// If we want to emit all the vtables, we need to mark it as used. This
// is especially required for cases like vtable assumption loads.
MarkVTableUsed(Record->getInnerLocStart(), Record);
}
}
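// Hedged example (hypothetical user code): among the checks performed above,
// the non-virtual-destructor warning (when -Wnon-virtual-dtor is enabled)
// triggers on a polymorphic class such as
//
//   struct Poly {
//     virtual void f();
//     ~Poly();  // warning: 'Poly' has virtual functions but a
//   };          // non-virtual destructor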
/// Look up the special member function that would be called by a special
/// member function for a subobject of class type.
///
/// \param Class The class type of the subobject.
/// \param CSM The kind of special member function.
/// \param FieldQuals If the subobject is a field, its cv-qualifiers.
/// \param ConstRHS True if this is a copy operation with a const object
/// on its RHS, that is, if the argument to the outer special member
/// function is 'const' and this is not a field marked 'mutable'.
static Sema::SpecialMemberOverloadResult lookupCallFromSpecialMember(
Sema &S, CXXRecordDecl *Class, Sema::CXXSpecialMember CSM,
unsigned FieldQuals, bool ConstRHS) {
unsigned LHSQuals = 0;
if (CSM == Sema::CXXCopyAssignment || CSM == Sema::CXXMoveAssignment)
LHSQuals = FieldQuals;
unsigned RHSQuals = FieldQuals;
if (CSM == Sema::CXXDefaultConstructor || CSM == Sema::CXXDestructor)
RHSQuals = 0;
else if (ConstRHS)
RHSQuals |= Qualifiers::Const;
return S.LookupSpecialMember(Class, CSM,
RHSQuals & Qualifiers::Const,
RHSQuals & Qualifiers::Volatile,
false,
LHSQuals & Qualifiers::Const,
LHSQuals & Qualifiers::Volatile);
}
class Sema::InheritedConstructorInfo {
Sema &S;
SourceLocation UseLoc;
/// A mapping from the base classes through which the constructor was
/// inherited to the using shadow declaration in that base class (or a null
/// pointer if the constructor was declared in that base class).
llvm::DenseMap<CXXRecordDecl *, ConstructorUsingShadowDecl *>
InheritedFromBases;
public:
InheritedConstructorInfo(Sema &S, SourceLocation UseLoc,
ConstructorUsingShadowDecl *Shadow)
: S(S), UseLoc(UseLoc) {
bool DiagnosedMultipleConstructedBases = false;
CXXRecordDecl *ConstructedBase = nullptr;
UsingDecl *ConstructedBaseUsing = nullptr;
// Find the set of such base class subobjects and check that there's a
// unique constructed subobject.
for (auto *D : Shadow->redecls()) {
auto *DShadow = cast<ConstructorUsingShadowDecl>(D);
auto *DNominatedBase = DShadow->getNominatedBaseClass();
auto *DConstructedBase = DShadow->getConstructedBaseClass();
InheritedFromBases.insert(
std::make_pair(DNominatedBase->getCanonicalDecl(),
DShadow->getNominatedBaseClassShadowDecl()));
if (DShadow->constructsVirtualBase())
InheritedFromBases.insert(
std::make_pair(DConstructedBase->getCanonicalDecl(),
DShadow->getConstructedBaseClassShadowDecl()));
else
assert(DNominatedBase == DConstructedBase);
// [class.inhctor.init]p2:
// If the constructor was inherited from multiple base class subobjects
// of type B, the program is ill-formed.
if (!ConstructedBase) {
ConstructedBase = DConstructedBase;
ConstructedBaseUsing = D->getUsingDecl();
} else if (ConstructedBase != DConstructedBase &&
!Shadow->isInvalidDecl()) {
if (!DiagnosedMultipleConstructedBases) {
S.Diag(UseLoc, diag::err_ambiguous_inherited_constructor)
<< Shadow->getTargetDecl();
S.Diag(ConstructedBaseUsing->getLocation(),
diag::note_ambiguous_inherited_constructor_using)
<< ConstructedBase;
DiagnosedMultipleConstructedBases = true;
}
S.Diag(D->getUsingDecl()->getLocation(),
diag::note_ambiguous_inherited_constructor_using)
<< DConstructedBase;
}
}
if (DiagnosedMultipleConstructedBases)
Shadow->setInvalidDecl();
}
/// Find the constructor to use for inherited construction of a base class,
/// and whether that base class constructor inherits the constructor from a
/// virtual base class (in which case it won't actually invoke it).
std::pair<CXXConstructorDecl *, bool>
findConstructorForBase(CXXRecordDecl *Base, CXXConstructorDecl *Ctor) const {
auto It = InheritedFromBases.find(Base->getCanonicalDecl());
if (It == InheritedFromBases.end())
return std::make_pair(nullptr, false);
// This is an intermediary class.
if (It->second)
return std::make_pair(
S.findInheritingConstructor(UseLoc, Ctor, It->second),
It->second->constructsVirtualBase());
// This is the base class from which the constructor was inherited.
return std::make_pair(Ctor, false);
}
};
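// Illustrative sketch (hypothetical user code): the ambiguity detected in the
// constructor of InheritedConstructorInfo corresponds to [class.inhctor.init]
// p2, where the same constructor is inherited through two base class
// subobjects of the same type:
//
//   struct A { A(int); };
//   struct B : A { using A::A; };
//   struct C1 : B { using B::B; };
//   struct C2 : B { using B::B; };
//   struct D : C1, C2 {
//     using C1::C1;
//     using C2::C2;
//   };
//   D d(0);  // error: constructor of 'A' inherited from multiple base
//            // class subobjects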
/// Is the special member function which would be selected to perform the
/// specified operation on the specified class type a constexpr constructor?
static bool
specialMemberIsConstexpr(Sema &S, CXXRecordDecl *ClassDecl,
Sema::CXXSpecialMember CSM, unsigned Quals,
bool ConstRHS,
CXXConstructorDecl *InheritedCtor = nullptr,
Sema::InheritedConstructorInfo *Inherited = nullptr) {
// If we're inheriting a constructor, see if we need to call it for this base
// class.
if (InheritedCtor) {
assert(CSM == Sema::CXXDefaultConstructor);
auto BaseCtor =
Inherited->findConstructorForBase(ClassDecl, InheritedCtor).first;
if (BaseCtor)
return BaseCtor->isConstexpr();
}
if (CSM == Sema::CXXDefaultConstructor)
return ClassDecl->hasConstexprDefaultConstructor();
Sema::SpecialMemberOverloadResult SMOR =
lookupCallFromSpecialMember(S, ClassDecl, CSM, Quals, ConstRHS);
if (!SMOR.getMethod())
// A constructor we wouldn't select can't be "involved in initializing"
// anything.
return true;
return SMOR.getMethod()->isConstexpr();
}
/// Determine whether the specified special member function would be constexpr
/// if it were implicitly defined.
static bool defaultedSpecialMemberIsConstexpr(
Sema &S, CXXRecordDecl *ClassDecl, Sema::CXXSpecialMember CSM,
bool ConstArg, CXXConstructorDecl *InheritedCtor = nullptr,
Sema::InheritedConstructorInfo *Inherited = nullptr) {
if (!S.getLangOpts().CPlusPlus11)
return false;
// C++11 [dcl.constexpr]p4:
// In the definition of a constexpr constructor [...]
bool Ctor = true;
switch (CSM) {
case Sema::CXXDefaultConstructor:
if (Inherited)
break;
// Since default constructor lookup is essentially trivial (and cannot
// involve, for instance, template instantiation), we compute whether a
// defaulted default constructor is constexpr directly within CXXRecordDecl.
//
// This is important for performance; we need to know whether the default
// constructor is constexpr to determine whether the type is a literal type.
return ClassDecl->defaultedDefaultConstructorIsConstexpr();
case Sema::CXXCopyConstructor:
case Sema::CXXMoveConstructor:
// For copy or move constructors, we need to perform overload resolution.
break;
case Sema::CXXCopyAssignment:
case Sema::CXXMoveAssignment:
if (!S.getLangOpts().CPlusPlus14)
return false;
// In C++1y, we need to perform overload resolution.
Ctor = false;
break;
case Sema::CXXDestructor:
case Sema::CXXInvalid:
return false;
}
// -- if the class is a non-empty union, or for each non-empty anonymous
// union member of a non-union class, exactly one non-static data member
// shall be initialized; [DR1359]
//
// If we squint, this is guaranteed, since exactly one non-static data member
// will be initialized (if the constructor isn't deleted), we just don't know
// which one.
if (Ctor && ClassDecl->isUnion())
return CSM == Sema::CXXDefaultConstructor
? ClassDecl->hasInClassInitializer() ||
!ClassDecl->hasVariantMembers()
: true;
// -- the class shall not have any virtual base classes;
if (Ctor && ClassDecl->getNumVBases())
return false;
// C++1y [class.copy]p26:
// -- [the class] is a literal type, and
if (!Ctor && !ClassDecl->isLiteral())
return false;
// -- every constructor involved in initializing [...] base class
// sub-objects shall be a constexpr constructor;
// -- the assignment operator selected to copy/move each direct base
// class is a constexpr function, and
for (const auto &B : ClassDecl->bases()) {
const RecordType *BaseType = B.getType()->getAs<RecordType>();
if (!BaseType) continue;
CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(BaseType->getDecl());
if (!specialMemberIsConstexpr(S, BaseClassDecl, CSM, 0, ConstArg,
InheritedCtor, Inherited))
return false;
}
// -- every constructor involved in initializing non-static data members
// [...] shall be a constexpr constructor;
// -- every non-static data member and base class sub-object shall be
// initialized
// -- for each non-static data member of X that is of class type (or array
// thereof), the assignment operator selected to copy/move that member is
// a constexpr function
for (const auto *F : ClassDecl->fields()) {
if (F->isInvalidDecl())
continue;
if (CSM == Sema::CXXDefaultConstructor && F->hasInClassInitializer())
continue;
QualType BaseType = S.Context.getBaseElementType(F->getType());
if (const RecordType *RecordTy = BaseType->getAs<RecordType>()) {
CXXRecordDecl *FieldRecDecl = cast<CXXRecordDecl>(RecordTy->getDecl());
if (!specialMemberIsConstexpr(S, FieldRecDecl, CSM,
BaseType.getCVRQualifiers(),
ConstArg && !F->isMutable()))
return false;
} else if (CSM == Sema::CXXDefaultConstructor) {
return false;
}
}
// All OK, it's constexpr!
return true;
}
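// Hedged example (hypothetical user code): by the rules implemented above, a
// defaulted copy constructor is implicitly constexpr when every subobject's
// copy is constexpr:
//
//   struct P { int x; };
//   constexpr P a{1};
//   constexpr P b = a;  // OK: the implicitly-defined copy constructor of 'P'
//                       // is constexpr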
static Sema::ImplicitExceptionSpecification
ComputeDefaultedSpecialMemberExceptionSpec(
Sema &S, SourceLocation Loc, CXXMethodDecl *MD, Sema::CXXSpecialMember CSM,
Sema::InheritedConstructorInfo *ICI);
static Sema::ImplicitExceptionSpecification
computeImplicitExceptionSpec(Sema &S, SourceLocation Loc, CXXMethodDecl *MD) {
auto CSM = S.getSpecialMember(MD);
if (CSM != Sema::CXXInvalid)
return ComputeDefaultedSpecialMemberExceptionSpec(S, Loc, MD, CSM, nullptr);
auto *CD = cast<CXXConstructorDecl>(MD);
assert(CD->getInheritedConstructor() &&
"only special members have implicit exception specs");
Sema::InheritedConstructorInfo ICI(
S, Loc, CD->getInheritedConstructor().getShadowDecl());
return ComputeDefaultedSpecialMemberExceptionSpec(
S, Loc, CD, Sema::CXXDefaultConstructor, &ICI);
}
static FunctionProtoType::ExtProtoInfo getImplicitMethodEPI(Sema &S,
CXXMethodDecl *MD) {
FunctionProtoType::ExtProtoInfo EPI;
// Build an exception specification pointing back at this member.
EPI.ExceptionSpec.Type = EST_Unevaluated;
EPI.ExceptionSpec.SourceDecl = MD;
// Set the calling convention to the default for C++ instance methods.
EPI.ExtInfo = EPI.ExtInfo.withCallingConv(
S.Context.getDefaultCallingConvention(/*IsVariadic=*/false,
/*IsCXXMethod=*/true));
return EPI;
}
void Sema::EvaluateImplicitExceptionSpec(SourceLocation Loc, CXXMethodDecl *MD) {
const FunctionProtoType *FPT = MD->getType()->castAs<FunctionProtoType>();
if (FPT->getExceptionSpecType() != EST_Unevaluated)
return;
// Evaluate the exception specification.
auto IES = computeImplicitExceptionSpec(*this, Loc, MD);
auto ESI = IES.getExceptionSpec();
// Update the type of the special member to use it.
UpdateExceptionSpec(MD, ESI);
// A user-provided destructor can be defined outside the class. When that
// happens, be sure to update the exception specification on both
// declarations.
const FunctionProtoType *CanonicalFPT =
MD->getCanonicalDecl()->getType()->castAs<FunctionProtoType>();
if (CanonicalFPT->getExceptionSpecType() == EST_Unevaluated)
UpdateExceptionSpec(MD->getCanonicalDecl(), ESI);
}
void Sema::CheckExplicitlyDefaultedSpecialMember(CXXMethodDecl *MD) {
CXXRecordDecl *RD = MD->getParent();
CXXSpecialMember CSM = getSpecialMember(MD);
assert(MD->isExplicitlyDefaulted() && CSM != CXXInvalid &&
"not an explicitly-defaulted special member");
// Whether this was the first-declared instance of the constructor.
// This affects whether we implicitly add an exception spec and constexpr.
bool First = MD == MD->getCanonicalDecl();
bool HadError = false;
// C++11 [dcl.fct.def.default]p1:
// A function that is explicitly defaulted shall
// -- be a special member function (checked elsewhere),
// -- have the same type (except for ref-qualifiers, and except that a
// copy operation can take a non-const reference) as an implicit
// declaration, and
// -- not have default arguments.
// C++2a changes the second bullet to instead delete the function if it's
// defaulted on its first declaration, unless it's "an assignment operator,
// and its return type differs or its parameter type is not a reference".
bool DeleteOnTypeMismatch = getLangOpts().CPlusPlus2a && First;
bool ShouldDeleteForTypeMismatch = false;
unsigned ExpectedParams = 1;
if (CSM == CXXDefaultConstructor || CSM == CXXDestructor)
ExpectedParams = 0;
if (MD->getNumParams() != ExpectedParams) {
// This checks for default arguments: a copy or move constructor with a
// default argument is classified as a default constructor, and assignment
// operations and destructors can't have default arguments.
Diag(MD->getLocation(), diag::err_defaulted_special_member_params)
<< CSM << MD->getSourceRange();
HadError = true;
} else if (MD->isVariadic()) {
if (DeleteOnTypeMismatch)
ShouldDeleteForTypeMismatch = true;
else {
Diag(MD->getLocation(), diag::err_defaulted_special_member_variadic)
<< CSM << MD->getSourceRange();
HadError = true;
}
}
const FunctionProtoType *Type = MD->getType()->getAs<FunctionProtoType>();
bool CanHaveConstParam = false;
if (CSM == CXXCopyConstructor)
CanHaveConstParam = RD->implicitCopyConstructorHasConstParam();
else if (CSM == CXXCopyAssignment)
CanHaveConstParam = RD->implicitCopyAssignmentHasConstParam();
QualType ReturnType = Context.VoidTy;
if (CSM == CXXCopyAssignment || CSM == CXXMoveAssignment) {
// Check for return type matching.
ReturnType = Type->getReturnType();
QualType DeclType = Context.getTypeDeclType(RD);
DeclType = Context.getAddrSpaceQualType(DeclType, MD->getMethodQualifiers().getAddressSpace());
QualType ExpectedReturnType = Context.getLValueReferenceType(DeclType);
if (!Context.hasSameType(ReturnType, ExpectedReturnType)) {
Diag(MD->getLocation(), diag::err_defaulted_special_member_return_type)
<< (CSM == CXXMoveAssignment) << ExpectedReturnType;
HadError = true;
}
// A defaulted special member cannot have cv-qualifiers.
if (Type->getMethodQuals().hasConst() || Type->getMethodQuals().hasVolatile()) {
if (DeleteOnTypeMismatch)
ShouldDeleteForTypeMismatch = true;
else {
Diag(MD->getLocation(), diag::err_defaulted_special_member_quals)
<< (CSM == CXXMoveAssignment) << getLangOpts().CPlusPlus14;
HadError = true;
}
}
}
// Check for parameter type matching.
QualType ArgType = ExpectedParams ? Type->getParamType(0) : QualType();
bool HasConstParam = false;
if (ExpectedParams && ArgType->isReferenceType()) {
// Argument must be reference to possibly-const T.
QualType ReferentType = ArgType->getPointeeType();
HasConstParam = ReferentType.isConstQualified();
if (ReferentType.isVolatileQualified()) {
if (DeleteOnTypeMismatch)
ShouldDeleteForTypeMismatch = true;
else {
Diag(MD->getLocation(),
diag::err_defaulted_special_member_volatile_param) << CSM;
HadError = true;
}
}
if (HasConstParam && !CanHaveConstParam) {
if (DeleteOnTypeMismatch)
ShouldDeleteForTypeMismatch = true;
else if (CSM == CXXCopyConstructor || CSM == CXXCopyAssignment) {
Diag(MD->getLocation(),
diag::err_defaulted_special_member_copy_const_param)
<< (CSM == CXXCopyAssignment);
// FIXME: Explain why this special member can't be const.
HadError = true;
} else {
Diag(MD->getLocation(),
diag::err_defaulted_special_member_move_const_param)
<< (CSM == CXXMoveAssignment);
HadError = true;
}
}
} else if (ExpectedParams) {
// A copy assignment operator can take its argument by value, but a
// defaulted one cannot.
assert(CSM == CXXCopyAssignment && "unexpected non-ref argument");
Diag(MD->getLocation(), diag::err_defaulted_copy_assign_not_ref);
HadError = true;
}
// C++11 [dcl.fct.def.default]p2:
// An explicitly-defaulted function may be declared constexpr only if it
// would have been implicitly declared as constexpr,
// Do not apply this rule to members of class templates, since core issue 1358
// makes such functions always instantiate to constexpr functions. For
// functions which cannot be constexpr (for non-constructors in C++11 and for
// destructors in C++1y), this is checked elsewhere.
//
// FIXME: This should not apply if the member is deleted.
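// A hypothetical example (for exposition only):
//   struct NonLiteral { NonLiteral(); };
//   struct X { NonLiteral n; constexpr X() = default; };
// The implicit default constructor of X would not be constexpr, because
// NonLiteral's default constructor is not constexpr, so the explicit
// 'constexpr' is diagnosed below.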
bool Constexpr = defaultedSpecialMemberIsConstexpr(*this, RD, CSM,
HasConstParam);
if ((getLangOpts().CPlusPlus14 ? !isa<CXXDestructorDecl>(MD)
: isa<CXXConstructorDecl>(MD)) &&
MD->isConstexpr() && !Constexpr &&
MD->getTemplatedKind() == FunctionDecl::TK_NonTemplate) {
Diag(MD->getBeginLoc(), MD->isConsteval()
? diag::err_incorrect_defaulted_consteval
: diag::err_incorrect_defaulted_constexpr)
<< CSM;
// FIXME: Explain why the special member can't be constexpr.
HadError = true;
}
if (First) {
// C++2a [dcl.fct.def.default]p3:
// If a function is explicitly defaulted on its first declaration, it is
// implicitly considered to be constexpr if the implicit declaration
// would be.
MD->setConstexprKind(Constexpr ? CSK_constexpr : CSK_unspecified);
if (!Type->hasExceptionSpec()) {
// C++2a [except.spec]p3:
// If a declaration of a function does not have a noexcept-specifier
// [and] is defaulted on its first declaration, [...] the exception
// specification is as specified below
FunctionProtoType::ExtProtoInfo EPI = Type->getExtProtoInfo();
EPI.ExceptionSpec.Type = EST_Unevaluated;
EPI.ExceptionSpec.SourceDecl = MD;
MD->setType(Context.getFunctionType(ReturnType,
llvm::makeArrayRef(&ArgType,
ExpectedParams),
EPI));
}
}
if (ShouldDeleteForTypeMismatch || ShouldDeleteSpecialMember(MD, CSM)) {
if (First) {
SetDeclDeleted(MD, MD->getLocation());
if (!inTemplateInstantiation() && !HadError) {
Diag(MD->getLocation(), diag::warn_defaulted_method_deleted) << CSM;
if (ShouldDeleteForTypeMismatch) {
Diag(MD->getLocation(), diag::note_deleted_type_mismatch) << CSM;
} else {
ShouldDeleteSpecialMember(MD, CSM, nullptr, /*Diagnose*/true);
}
}
if (ShouldDeleteForTypeMismatch && !HadError) {
Diag(MD->getLocation(),
diag::warn_cxx17_compat_defaulted_method_type_mismatch) << CSM;
}
} else {
// C++11 [dcl.fct.def.default]p4:
// [For a] user-provided explicitly-defaulted function [...] if such a
// function is implicitly defined as deleted, the program is ill-formed.
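// A hypothetical example (for exposition only):
//   struct Z { const int i; Z& operator=(const Z&); };
//   Z& Z::operator=(const Z&) = default;
// The implicit copy assignment would be deleted (const member), so this
// out-of-line defaulted definition makes the program ill-formed.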
Diag(MD->getLocation(), diag::err_out_of_line_default_deletes) << CSM;
assert(!ShouldDeleteForTypeMismatch && "deleted non-first decl");
ShouldDeleteSpecialMember(MD, CSM, nullptr, /*Diagnose*/true);
HadError = true;
}
}
if (HadError)
MD->setInvalidDecl();
}
void Sema::CheckDelayedMemberExceptionSpecs() {
decltype(DelayedOverridingExceptionSpecChecks) Overriding;
decltype(DelayedEquivalentExceptionSpecChecks) Equivalent;
std::swap(Overriding, DelayedOverridingExceptionSpecChecks);
std::swap(Equivalent, DelayedEquivalentExceptionSpecChecks);
// Perform any deferred checking of exception specifications for virtual
// destructors.
for (auto &Check : Overriding)
CheckOverridingFunctionExceptionSpec(Check.first, Check.second);
// Perform any deferred checking of exception specifications for befriended
// special members.
for (auto &Check : Equivalent)
CheckEquivalentExceptionSpec(Check.second, Check.first);
}
namespace {
/// CRTP base class for visiting operations performed by a special member
/// function (or inherited constructor).
template<typename Derived>
struct SpecialMemberVisitor {
Sema &S;
CXXMethodDecl *MD;
Sema::CXXSpecialMember CSM;
Sema::InheritedConstructorInfo *ICI;
// Properties of the special member, computed for convenience.
bool IsConstructor = false, IsAssignment = false, ConstArg = false;
SpecialMemberVisitor(Sema &S, CXXMethodDecl *MD, Sema::CXXSpecialMember CSM,
Sema::InheritedConstructorInfo *ICI)
: S(S), MD(MD), CSM(CSM), ICI(ICI) {
switch (CSM) {
case Sema::CXXDefaultConstructor:
case Sema::CXXCopyConstructor:
case Sema::CXXMoveConstructor:
IsConstructor = true;
break;
case Sema::CXXCopyAssignment:
case Sema::CXXMoveAssignment:
IsAssignment = true;
break;
case Sema::CXXDestructor:
break;
case Sema::CXXInvalid:
llvm_unreachable("invalid special member kind");
}
if (MD->getNumParams()) {
if (const ReferenceType *RT =
MD->getParamDecl(0)->getType()->getAs<ReferenceType>())
ConstArg = RT->getPointeeType().isConstQualified();
}
}
Derived &getDerived() { return static_cast<Derived&>(*this); }
/// Is this a "move" special member?
bool isMove() const {
return CSM == Sema::CXXMoveConstructor || CSM == Sema::CXXMoveAssignment;
}
/// Look up the corresponding special member in the given class.
Sema::SpecialMemberOverloadResult lookupIn(CXXRecordDecl *Class,
unsigned Quals, bool IsMutable) {
return lookupCallFromSpecialMember(S, Class, CSM, Quals,
ConstArg && !IsMutable);
}
/// Look up the constructor for the specified base class to see if it's
/// overridden due to this being an inherited constructor.
Sema::SpecialMemberOverloadResult lookupInheritedCtor(CXXRecordDecl *Class) {
if (!ICI)
return {};
assert(CSM == Sema::CXXDefaultConstructor);
auto *BaseCtor =
cast<CXXConstructorDecl>(MD)->getInheritedConstructor().getConstructor();
if (auto *MD = ICI->findConstructorForBase(Class, BaseCtor).first)
return MD;
return {};
}
/// A base or member subobject.
typedef llvm::PointerUnion<CXXBaseSpecifier*, FieldDecl*> Subobject;
/// Get the location to use for a subobject in diagnostics.
static SourceLocation getSubobjectLoc(Subobject Subobj) {
// FIXME: For an indirect virtual base, the direct base leading to
// the indirect virtual base would be a more useful choice.
if (auto *B = Subobj.dyn_cast<CXXBaseSpecifier*>())
return B->getBaseTypeLoc();
else
return Subobj.get<FieldDecl*>()->getLocation();
}
enum BasesToVisit {
/// Visit all non-virtual (direct) bases.
VisitNonVirtualBases,
/// Visit all direct bases, virtual or not.
VisitDirectBases,
/// Visit all non-virtual bases, and all virtual bases if the class
/// is not abstract.
VisitPotentiallyConstructedBases,
/// Visit all direct or virtual bases.
VisitAllBases
};
// Visit the bases and members of the class.
bool visit(BasesToVisit Bases) {
CXXRecordDecl *RD = MD->getParent();
if (Bases == VisitPotentiallyConstructedBases)
Bases = RD->isAbstract() ? VisitNonVirtualBases : VisitAllBases;
for (auto &B : RD->bases())
if ((Bases == VisitDirectBases || !B.isVirtual()) &&
getDerived().visitBase(&B))
return true;
if (Bases == VisitAllBases)
for (auto &B : RD->vbases())
if (getDerived().visitBase(&B))
return true;
for (auto *F : RD->fields())
if (!F->isInvalidDecl() && !F->isUnnamedBitfield() &&
getDerived().visitField(F))
return true;
return false;
}
};
}
namespace {
struct SpecialMemberDeletionInfo
: SpecialMemberVisitor<SpecialMemberDeletionInfo> {
bool Diagnose;
SourceLocation Loc;
bool AllFieldsAreConst;
SpecialMemberDeletionInfo(Sema &S, CXXMethodDecl *MD,
Sema::CXXSpecialMember CSM,
Sema::InheritedConstructorInfo *ICI, bool Diagnose)
: SpecialMemberVisitor(S, MD, CSM, ICI), Diagnose(Diagnose),
Loc(MD->getLocation()), AllFieldsAreConst(true) {}
bool inUnion() const { return MD->getParent()->isUnion(); }
Sema::CXXSpecialMember getEffectiveCSM() {
return ICI ? Sema::CXXInvalid : CSM;
}
bool shouldDeleteForVariantObjCPtrMember(FieldDecl *FD, QualType FieldType);
bool visitBase(CXXBaseSpecifier *Base) { return shouldDeleteForBase(Base); }
bool visitField(FieldDecl *Field) { return shouldDeleteForField(Field); }
bool shouldDeleteForBase(CXXBaseSpecifier *Base);
bool shouldDeleteForField(FieldDecl *FD);
bool shouldDeleteForAllConstMembers();
bool shouldDeleteForClassSubobject(CXXRecordDecl *Class, Subobject Subobj,
unsigned Quals);
bool shouldDeleteForSubobjectCall(Subobject Subobj,
Sema::SpecialMemberOverloadResult SMOR,
bool IsDtorCallInCtor);
bool isAccessible(Subobject Subobj, CXXMethodDecl *D);
};
}
/// Is the given special member inaccessible when used on the given
/// sub-object.
bool SpecialMemberDeletionInfo::isAccessible(Subobject Subobj,
CXXMethodDecl *target) {
/// If we're operating on a base class, the object type is the
/// type of this special member.
QualType objectTy;
AccessSpecifier access = target->getAccess();
if (CXXBaseSpecifier *base = Subobj.dyn_cast<CXXBaseSpecifier*>()) {
objectTy = S.Context.getTypeDeclType(MD->getParent());
access = CXXRecordDecl::MergeAccess(base->getAccessSpecifier(), access);
// If we're operating on a field, the object type is the type of the field.
} else {
objectTy = S.Context.getTypeDeclType(target->getParent());
}
return S.isSpecialMemberAccessibleForDeletion(target, access, objectTy);
}
/// Check whether we should delete a special member due to the implicit
/// definition containing a call to a special member of a subobject.
bool SpecialMemberDeletionInfo::shouldDeleteForSubobjectCall(
Subobject Subobj, Sema::SpecialMemberOverloadResult SMOR,
bool IsDtorCallInCtor) {
CXXMethodDecl *Decl = SMOR.getMethod();
FieldDecl *Field = Subobj.dyn_cast<FieldDecl*>();
int DiagKind = -1;
if (SMOR.getKind() == Sema::SpecialMemberOverloadResult::NoMemberOrDeleted)
DiagKind = !Decl ? 0 : 1;
else if (SMOR.getKind() == Sema::SpecialMemberOverloadResult::Ambiguous)
DiagKind = 2;
else if (!isAccessible(Subobj, Decl))
DiagKind = 3;
else if (!IsDtorCallInCtor && Field && Field->getParent()->isUnion() &&
!Decl->isTrivial()) {
// A member of a union must have a trivial corresponding special member.
// As a weird special case, a destructor call from a union's constructor
// must be accessible and non-deleted, but need not be trivial. Such a
// destructor is never actually called, but is semantically checked as
// if it were.
DiagKind = 4;
}
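// A hypothetical example of the union case above (for exposition only):
//   struct NT { NT(); ~NT(); };
//   union U { NT n; };
// U's defaulted default constructor and destructor are deleted because NT's
// corresponding special members are non-trivial.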
if (DiagKind == -1)
return false;
if (Diagnose) {
if (Field) {
S.Diag(Field->getLocation(),
diag::note_deleted_special_member_class_subobject)
<< getEffectiveCSM() << MD->getParent() << /*IsField*/true
<< Field << DiagKind << IsDtorCallInCtor << /*IsObjCPtr*/false;
} else {
CXXBaseSpecifier *Base = Subobj.get<CXXBaseSpecifier*>();
S.Diag(Base->getBeginLoc(),
diag::note_deleted_special_member_class_subobject)
<< getEffectiveCSM() << MD->getParent() << /*IsField*/ false
<< Base->getType() << DiagKind << IsDtorCallInCtor
<< /*IsObjCPtr*/false;
}
if (DiagKind == 1)
S.NoteDeletedFunction(Decl);
// FIXME: Explain inaccessibility if DiagKind == 3.
}
return true;
}
/// Check whether we should delete a special member function due to having a
/// direct or virtual base class or non-static data member of class type M.
bool SpecialMemberDeletionInfo::shouldDeleteForClassSubobject(
CXXRecordDecl *Class, Subobject Subobj, unsigned Quals) {
FieldDecl *Field = Subobj.dyn_cast<FieldDecl*>();
bool IsMutable = Field && Field->isMutable();
// C++11 [class.ctor]p5:
// -- any direct or virtual base class, or non-static data member with no
// brace-or-equal-initializer, has class type M (or array thereof) and
// either M has no default constructor or overload resolution as applied
// to M's default constructor results in an ambiguity or in a function
// that is deleted or inaccessible
// C++11 [class.copy]p11, C++11 [class.copy]p23:
// -- a direct or virtual base class B that cannot be copied/moved because
// overload resolution, as applied to B's corresponding special member,
// results in an ambiguity or a function that is deleted or inaccessible
// from the defaulted special member
// C++11 [class.dtor]p5:
// -- any direct or virtual base class [...] has a type with a destructor
// that is deleted or inaccessible
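// For exposition, a hypothetical case covered by the rules above:
//   struct NoDefault { NoDefault(int); };
//   struct D { NoDefault m; };
// D's defaulted default constructor is deleted because overload resolution
// finds no usable default constructor for the member 'm'.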
if (!(CSM == Sema::CXXDefaultConstructor &&
Field && Field->hasInClassInitializer()) &&
shouldDeleteForSubobjectCall(Subobj, lookupIn(Class, Quals, IsMutable),
false))
return true;
// C++11 [class.ctor]p5, C++11 [class.copy]p11:
// -- any direct or virtual base class or non-static data member has a
// type with a destructor that is deleted or inaccessible
if (IsConstructor) {
Sema::SpecialMemberOverloadResult SMOR =
S.LookupSpecialMember(Class, Sema::CXXDestructor,
false, false, false, false, false);
if (shouldDeleteForSubobjectCall(Subobj, SMOR, true))
return true;
}
return false;
}
bool SpecialMemberDeletionInfo::shouldDeleteForVariantObjCPtrMember(
FieldDecl *FD, QualType FieldType) {
// The defaulted special functions are defined as deleted if this is a variant
// member with a non-trivial ownership type, e.g., ObjC __strong or __weak
// type under ARC.
if (!FieldType.hasNonTrivialObjCLifetime())
return false;
// Don't define the defaulted default constructor as deleted if the member
// has an in-class initializer.
if (CSM == Sema::CXXDefaultConstructor && FD->hasInClassInitializer())
return false;
if (Diagnose) {
auto *ParentClass = cast<CXXRecordDecl>(FD->getParent());
S.Diag(FD->getLocation(),
diag::note_deleted_special_member_class_subobject)
<< getEffectiveCSM() << ParentClass << /*IsField*/true
<< FD << 4 << /*IsDtorCallInCtor*/false << /*IsObjCPtr*/true;
}
return true;
}
/// Check whether we should delete a special member function due to the class
/// having a particular direct or virtual base class.
bool SpecialMemberDeletionInfo::shouldDeleteForBase(CXXBaseSpecifier *Base) {
CXXRecordDecl *BaseClass = Base->getType()->getAsCXXRecordDecl();
// If the program is correct, BaseClass cannot be null, but if it is, the
// error must be reported elsewhere.
if (!BaseClass)
return false;
// If we have an inheriting constructor, check whether we're calling an
// inherited constructor instead of a default constructor.
Sema::SpecialMemberOverloadResult SMOR = lookupInheritedCtor(BaseClass);
if (auto *BaseCtor = SMOR.getMethod()) {
// Note that we do not check access along this path; other than that,
// this is the same as shouldDeleteForSubobjectCall(Base, BaseCtor, false);
// FIXME: Check that the base has a usable destructor! Sink this into
// shouldDeleteForClassSubobject.
if (BaseCtor->isDeleted() && Diagnose) {
S.Diag(Base->getBeginLoc(),
diag::note_deleted_special_member_class_subobject)
<< getEffectiveCSM() << MD->getParent() << /*IsField*/ false
<< Base->getType() << /*Deleted*/ 1 << /*IsDtorCallInCtor*/ false
<< /*IsObjCPtr*/false;
S.NoteDeletedFunction(BaseCtor);
}
return BaseCtor->isDeleted();
}
return shouldDeleteForClassSubobject(BaseClass, Base, 0);
}
/// Check whether we should delete a special member function due to the class
/// having a particular non-static data member.
bool SpecialMemberDeletionInfo::shouldDeleteForField(FieldDecl *FD) {
QualType FieldType = S.Context.getBaseElementType(FD->getType());
CXXRecordDecl *FieldRecord = FieldType->getAsCXXRecordDecl();
if (inUnion() && shouldDeleteForVariantObjCPtrMember(FD, FieldType))
return true;
if (CSM == Sema::CXXDefaultConstructor) {
// For a default constructor, all references must be initialized in-class
// and, if a union, it must have a non-const member.
if (FieldType->isReferenceType() && !FD->hasInClassInitializer()) {
if (Diagnose)
S.Diag(FD->getLocation(), diag::note_deleted_default_ctor_uninit_field)
<< !!ICI << MD->getParent() << FD << FieldType << /*Reference*/0;
return true;
}
// C++11 [class.ctor]p5: any non-variant non-static data member of
// const-qualified type (or array thereof) with no
// brace-or-equal-initializer does not have a user-provided default
// constructor.
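// A hypothetical example (for exposition only):
//   struct C { const int k; };
// C's defaulted default constructor is deleted: 'k' is const, has no
// initializer, and 'int' has no user-provided default constructor.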
if (!inUnion() && FieldType.isConstQualified() &&
!FD->hasInClassInitializer() &&
(!FieldRecord || !FieldRecord->hasUserProvidedDefaultConstructor())) {
if (Diagnose)
S.Diag(FD->getLocation(), diag::note_deleted_default_ctor_uninit_field)
<< !!ICI << MD->getParent() << FD << FD->getType() << /*Const*/1;
return true;
}
if (inUnion() && !FieldType.isConstQualified())
AllFieldsAreConst = false;
} else if (CSM == Sema::CXXCopyConstructor) {
// For a copy constructor, data members must not be of rvalue reference
// type.
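// A hypothetical example (for exposition only):
//   struct R { int &&r; };  // R's defaulted copy constructor is deleted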
if (FieldType->isRValueReferenceType()) {
if (Diagnose)
S.Diag(FD->getLocation(), diag::note_deleted_copy_ctor_rvalue_reference)
<< MD->getParent() << FD << FieldType;
return true;
}
} else if (IsAssignment) {
// For an assignment operator, data members must not be of reference type.
if (FieldType->isReferenceType()) {
if (Diagnose)
S.Diag(FD->getLocation(), diag::note_deleted_assign_field)
<< isMove() << MD->getParent() << FD << FieldType << /*Reference*/0;
return true;
}
if (!FieldRecord && FieldType.isConstQualified()) {
// C++11 [class.copy]p23:
// -- a non-static data member of const non-class type (or array thereof)
if (Diagnose)
S.Diag(FD->getLocation(), diag::note_deleted_assign_field)
<< isMove() << MD->getParent() << FD << FD->getType() << /*Const*/1;
return true;
}
}
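// Hypothetical examples for the two assignment checks above (for exposition
// only):
//   struct A1 { int &r; };       // copy/move assignment deleted (reference)
//   struct B1 { const int k; };  // copy/move assignment deleted (const)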
if (FieldRecord) {
// Some additional restrictions exist on the variant members.
if (!inUnion() && FieldRecord->isUnion() &&
FieldRecord->isAnonymousStructOrUnion()) {
bool AllVariantFieldsAreConst = true;
// FIXME: Handle anonymous unions declared within anonymous unions.
for (auto *UI : FieldRecord->fields()) {
QualType UnionFieldType = S.Context.getBaseElementType(UI->getType());
if (shouldDeleteForVariantObjCPtrMember(&*UI, UnionFieldType))
return true;
if (!UnionFieldType.isConstQualified())
AllVariantFieldsAreConst = false;
CXXRecordDecl *UnionFieldRecord = UnionFieldType->getAsCXXRecordDecl();
if (UnionFieldRecord &&
shouldDeleteForClassSubobject(UnionFieldRecord, UI,
UnionFieldType.getCVRQualifiers()))
return true;
}
// At least one member in each anonymous union must be non-const
if (CSM == Sema::CXXDefaultConstructor && AllVariantFieldsAreConst &&
!FieldRecord->field_empty()) {
if (Diagnose)
S.Diag(FieldRecord->getLocation(),
diag::note_deleted_default_ctor_all_const)
<< !!ICI << MD->getParent() << /*anonymous union*/1;
return true;
}
// Don't check the implicit member of the anonymous union type.
// This is technically non-conformant, but sanity demands it.
return false;
}
if (shouldDeleteForClassSubobject(FieldRecord, FD,
FieldType.getCVRQualifiers()))
return true;
}
return false;
}
/// C++11 [class.ctor] p5:
/// A defaulted default constructor for a class X is defined as deleted if
/// X is a union and all of its variant members are of const-qualified type.
bool SpecialMemberDeletionInfo::shouldDeleteForAllConstMembers() {
// This is a silly definition, because it gives an empty union a deleted
// default constructor. Don't do that.
if (CSM == Sema::CXXDefaultConstructor && inUnion() && AllFieldsAreConst) {
bool AnyFields = false;
for (auto *F : MD->getParent()->fields())
if ((AnyFields = !F->isUnnamedBitfield()))
break;
if (!AnyFields)
return false;
if (Diagnose)
S.Diag(MD->getParent()->getLocation(),
diag::note_deleted_default_ctor_all_const)
<< !!ICI << MD->getParent() << /*not anonymous union*/0;
return true;
}
return false;
}
/// Determine whether a defaulted special member function should be defined as
/// deleted, as specified in C++11 [class.ctor]p5, C++11 [class.copy]p11,
/// C++11 [class.copy]p23, and C++11 [class.dtor]p5.
bool Sema::ShouldDeleteSpecialMember(CXXMethodDecl *MD, CXXSpecialMember CSM,
InheritedConstructorInfo *ICI,
bool Diagnose) {
if (MD->isInvalidDecl())
return false;
CXXRecordDecl *RD = MD->getParent();
assert(!RD->isDependentType() && "do deletion after instantiation");
if (!LangOpts.CPlusPlus11 || RD->isInvalidDecl())
return false;
// C++11 [expr.lambda.prim]p19:
// The closure type associated with a lambda-expression has a
// deleted (8.4.3) default constructor and a deleted copy
// assignment operator.
// C++2a adds back these operators if the lambda has no lambda-capture.
if (RD->isLambda() && !RD->lambdaIsDefaultConstructibleAndAssignable() &&
(CSM == CXXDefaultConstructor || CSM == CXXCopyAssignment)) {
if (Diagnose)
Diag(RD->getLocation(), diag::note_lambda_decl);
return true;
}
// For an anonymous struct or union, the copy and assignment special members
// will never be used, so skip the check. For an anonymous union declared at
// namespace scope, the constructor and destructor are used.
if (CSM != CXXDefaultConstructor && CSM != CXXDestructor &&
RD->isAnonymousStructOrUnion())
return false;
// C++11 [class.copy]p7, p18:
// If the class definition declares a move constructor or move assignment
// operator, an implicitly declared copy constructor or copy assignment
// operator is defined as deleted.
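// A hypothetical example (for exposition only):
//   struct M { M(M&&); };
// M's implicit copy constructor and copy assignment operator are both
// defined as deleted.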
if (MD->isImplicit() &&
(CSM == CXXCopyConstructor || CSM == CXXCopyAssignment)) {
CXXMethodDecl *UserDeclaredMove = nullptr;
// In Microsoft mode up to MSVC 2013, a user-declared move only causes the
// deletion of the corresponding copy operation, not both copy operations.
// MSVC 2015 has adopted the standards conforming behavior.
bool DeletesOnlyMatchingCopy =
getLangOpts().MSVCCompat &&
!getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015);
if (RD->hasUserDeclaredMoveConstructor() &&
(!DeletesOnlyMatchingCopy || CSM == CXXCopyConstructor)) {
if (!Diagnose) return true;
// Find any user-declared move constructor.
for (auto *I : RD->ctors()) {
if (I->isMoveConstructor()) {
UserDeclaredMove = I;
break;
}
}
assert(UserDeclaredMove);
} else if (RD->hasUserDeclaredMoveAssignment() &&
(!DeletesOnlyMatchingCopy || CSM == CXXCopyAssignment)) {
if (!Diagnose) return true;
// Find any user-declared move assignment operator.
for (auto *I : RD->methods()) {
if (I->isMoveAssignmentOperator()) {
UserDeclaredMove = I;
break;
}
}
assert(UserDeclaredMove);
}
if (UserDeclaredMove) {
Diag(UserDeclaredMove->getLocation(),
diag::note_deleted_copy_user_declared_move)
<< (CSM == CXXCopyAssignment) << RD
<< UserDeclaredMove->isMoveAssignmentOperator();
return true;
}
}
// Do access control from the special member function
ContextRAII MethodContext(*this, MD);
// C++11 [class.dtor]p5:
// -- for a virtual destructor, lookup of the non-array deallocation function
// results in an ambiguity or in a function that is deleted or inaccessible
if (CSM == CXXDestructor && MD->isVirtual()) {
FunctionDecl *OperatorDelete = nullptr;
DeclarationName Name =
Context.DeclarationNames.getCXXOperatorName(OO_Delete);
if (FindDeallocationFunction(MD->getLocation(), MD->getParent(), Name,
OperatorDelete, /*Diagnose*/false)) {
if (Diagnose)
Diag(RD->getLocation(), diag::note_deleted_dtor_no_operator_delete);
return true;
}
}
SpecialMemberDeletionInfo SMI(*this, MD, CSM, ICI, Diagnose);
// Per DR1611, do not consider virtual bases of constructors of abstract
// classes, since we are not going to construct them.
// Per DR1658, do not consider virtual bases of destructors of abstract
// classes either.
// Per DR2180, for assignment operators we only assign (and thus only
// consider) direct bases.
if (SMI.visit(SMI.IsAssignment ? SMI.VisitDirectBases
: SMI.VisitPotentiallyConstructedBases))
return true;
if (SMI.shouldDeleteForAllConstMembers())
return true;
if (getLangOpts().CUDA) {
// We should delete the special member in CUDA mode if target inference
// failed.
// For inherited constructors (non-null ICI), CSM may be passed in so that MD
// is treated as a certain special member, which may not reflect what special
// member MD really is. However, inferCUDATargetForImplicitSpecialMember
// expects CSM to match MD, so recalculate CSM.
assert(ICI || CSM == getSpecialMember(MD));
auto RealCSM = CSM;
if (ICI)
RealCSM = getSpecialMember(MD);
return inferCUDATargetForImplicitSpecialMember(RD, RealCSM, MD,
SMI.ConstArg, Diagnose);
}
return false;
}
/// Perform lookup for a special member of the specified kind, and determine
/// whether it is trivial. If the triviality can be determined without the
/// lookup, skip it. This is intended for use when determining whether a
/// special member of a containing object is trivial, and thus does not ever
/// perform overload resolution for default constructors.
///
/// If \p Selected is not \c NULL, \c *Selected will be filled in with the
/// member that was most likely to be intended to be trivial, if any.
///
/// If \p ForCall is true, look at CXXRecord::HasTrivialSpecialMembersForCall to
/// determine whether the special member is trivial.
static bool findTrivialSpecialMember(Sema &S, CXXRecordDecl *RD,
Sema::CXXSpecialMember CSM, unsigned Quals,
bool ConstRHS,
Sema::TrivialABIHandling TAH,
CXXMethodDecl **Selected) {
if (Selected)
*Selected = nullptr;
switch (CSM) {
case Sema::CXXInvalid:
llvm_unreachable("not a special member");
case Sema::CXXDefaultConstructor:
// C++11 [class.ctor]p5:
// A default constructor is trivial if:
// - all the [direct subobjects] have trivial default constructors
//
// Note, no overload resolution is performed in this case.
if (RD->hasTrivialDefaultConstructor())
return true;
if (Selected) {
// If there's a default constructor which could have been trivial, dig it
// out. Otherwise, if there's any user-provided default constructor, point
// to that as an example of why there's not a trivial one.
CXXConstructorDecl *DefCtor = nullptr;
if (RD->needsImplicitDefaultConstructor())
S.DeclareImplicitDefaultConstructor(RD);
for (auto *CI : RD->ctors()) {
if (!CI->isDefaultConstructor())
continue;
DefCtor = CI;
if (!DefCtor->isUserProvided())
break;
}
*Selected = DefCtor;
}
return false;
case Sema::CXXDestructor:
// C++11 [class.dtor]p5:
// A destructor is trivial if:
// - all the direct [subobjects] have trivial destructors
if (RD->hasTrivialDestructor() ||
(TAH == Sema::TAH_ConsiderTrivialABI &&
RD->hasTrivialDestructorForCall()))
return true;
if (Selected) {
if (RD->needsImplicitDestructor())
S.DeclareImplicitDestructor(RD);
*Selected = RD->getDestructor();
}
return false;
case Sema::CXXCopyConstructor:
// C++11 [class.copy]p12:
// A copy constructor is trivial if:
// - the constructor selected to copy each direct [subobject] is trivial
if (RD->hasTrivialCopyConstructor() ||
(TAH == Sema::TAH_ConsiderTrivialABI &&
RD->hasTrivialCopyConstructorForCall())) {
if (Quals == Qualifiers::Const)
// We must either select the trivial copy constructor or reach an
// ambiguity; no need to actually perform overload resolution.
return true;
} else if (!Selected) {
return false;
}
// In C++98, we are not supposed to perform overload resolution here, but we
// treat that as a language defect, as suggested on cxx-abi-dev, to treat
// cases like B as having a non-trivial copy constructor:
// struct A { template<typename T> A(T&); };
// struct B { mutable A a; };
goto NeedOverloadResolution;
case Sema::CXXCopyAssignment:
// C++11 [class.copy]p25:
// A copy assignment operator is trivial if:
// - the assignment operator selected to copy each direct [subobject] is
// trivial
if (RD->hasTrivialCopyAssignment()) {
if (Quals == Qualifiers::Const)
return true;
} else if (!Selected) {
return false;
}
// In C++98, we are not supposed to perform overload resolution here, but we
// treat that as a language defect.
goto NeedOverloadResolution;
case Sema::CXXMoveConstructor:
case Sema::CXXMoveAssignment:
NeedOverloadResolution:
Sema::SpecialMemberOverloadResult SMOR =
lookupCallFromSpecialMember(S, RD, CSM, Quals, ConstRHS);
// The standard doesn't describe how to behave if the lookup is ambiguous.
// We treat it as not making the member non-trivial, just like the standard
// mandates for the default constructor. This should rarely matter, because
// the member will also be deleted.
if (SMOR.getKind() == Sema::SpecialMemberOverloadResult::Ambiguous)
return true;
if (!SMOR.getMethod()) {
assert(SMOR.getKind() ==
Sema::SpecialMemberOverloadResult::NoMemberOrDeleted);
return false;
}
// We deliberately don't check if we found a deleted special member. We're
// not supposed to!
if (Selected)
*Selected = SMOR.getMethod();
if (TAH == Sema::TAH_ConsiderTrivialABI &&
(CSM == Sema::CXXCopyConstructor || CSM == Sema::CXXMoveConstructor))
return SMOR.getMethod()->isTrivialForCall();
return SMOR.getMethod()->isTrivial();
}
llvm_unreachable("unknown special method kind");
}
static CXXConstructorDecl *findUserDeclaredCtor(CXXRecordDecl *RD) {
for (auto *CI : RD->ctors())
if (!CI->isImplicit())
return CI;
// Look for constructor templates.
typedef CXXRecordDecl::specific_decl_iterator<FunctionTemplateDecl> tmpl_iter;
for (tmpl_iter TI(RD->decls_begin()), TE(RD->decls_end()); TI != TE; ++TI) {
if (CXXConstructorDecl *CD =
dyn_cast<CXXConstructorDecl>(TI->getTemplatedDecl()))
return CD;
}
return nullptr;
}
/// The kind of subobject we are checking for triviality. The values of this
/// enumeration are used in diagnostics.
enum TrivialSubobjectKind {
/// The subobject is a base class.
TSK_BaseClass,
/// The subobject is a non-static data member.
TSK_Field,
/// The object is actually the complete object.
TSK_CompleteObject
};
/// Check whether the special member selected for a given type would be trivial.
static bool checkTrivialSubobjectCall(Sema &S, SourceLocation SubobjLoc,
QualType SubType, bool ConstRHS,
Sema::CXXSpecialMember CSM,
TrivialSubobjectKind Kind,
Sema::TrivialABIHandling TAH, bool Diagnose) {
CXXRecordDecl *SubRD = SubType->getAsCXXRecordDecl();
if (!SubRD)
return true;
CXXMethodDecl *Selected;
if (findTrivialSpecialMember(S, SubRD, CSM, SubType.getCVRQualifiers(),
ConstRHS, TAH, Diagnose ? &Selected : nullptr))
return true;
if (Diagnose) {
if (ConstRHS)
SubType.addConst();
if (!Selected && CSM == Sema::CXXDefaultConstructor) {
S.Diag(SubobjLoc, diag::note_nontrivial_no_def_ctor)
<< Kind << SubType.getUnqualifiedType();
if (CXXConstructorDecl *CD = findUserDeclaredCtor(SubRD))
S.Diag(CD->getLocation(), diag::note_user_declared_ctor);
} else if (!Selected)
S.Diag(SubobjLoc, diag::note_nontrivial_no_copy)
<< Kind << SubType.getUnqualifiedType() << CSM << SubType;
else if (Selected->isUserProvided()) {
if (Kind == TSK_CompleteObject)
S.Diag(Selected->getLocation(), diag::note_nontrivial_user_provided)
<< Kind << SubType.getUnqualifiedType() << CSM;
else {
S.Diag(SubobjLoc, diag::note_nontrivial_user_provided)
<< Kind << SubType.getUnqualifiedType() << CSM;
S.Diag(Selected->getLocation(), diag::note_declared_at);
}
} else {
if (Kind != TSK_CompleteObject)
S.Diag(SubobjLoc, diag::note_nontrivial_subobject)
<< Kind << SubType.getUnqualifiedType() << CSM;
// Explain why the defaulted or deleted special member isn't trivial.
S.SpecialMemberIsTrivial(Selected, CSM, Sema::TAH_IgnoreTrivialABI,
Diagnose);
}
}
return false;
}
/// Check whether the members of a class type allow a special member to be
/// trivial.
static bool checkTrivialClassMembers(Sema &S, CXXRecordDecl *RD,
Sema::CXXSpecialMember CSM,
bool ConstArg,
Sema::TrivialABIHandling TAH,
bool Diagnose) {
for (const auto *FI : RD->fields()) {
if (FI->isInvalidDecl() || FI->isUnnamedBitfield())
continue;
QualType FieldType = S.Context.getBaseElementType(FI->getType());
// Pretend anonymous struct or union members are members of this class.
if (FI->isAnonymousStructOrUnion()) {
if (!checkTrivialClassMembers(S, FieldType->getAsCXXRecordDecl(),
CSM, ConstArg, TAH, Diagnose))
return false;
continue;
}
// C++11 [class.ctor]p5:
// A default constructor is trivial if [...]
// -- no non-static data member of its class has a
// brace-or-equal-initializer
if (CSM == Sema::CXXDefaultConstructor && FI->hasInClassInitializer()) {
if (Diagnose)
S.Diag(FI->getLocation(), diag::note_nontrivial_in_class_init) << FI;
return false;
}
// Objective C ARC 4.3.5:
// [...] nontrivially ownership-qualified types are [...] not trivially
// default constructible, copy constructible, move constructible, copy
// assignable, move assignable, or destructible [...]
if (FieldType.hasNonTrivialObjCLifetime()) {
if (Diagnose)
S.Diag(FI->getLocation(), diag::note_nontrivial_objc_ownership)
<< RD << FieldType.getObjCLifetime();
return false;
}
bool ConstRHS = ConstArg && !FI->isMutable();
if (!checkTrivialSubobjectCall(S, FI->getLocation(), FieldType, ConstRHS,
CSM, TSK_Field, TAH, Diagnose))
return false;
}
return true;
}
/// Diagnose why the specified class does not have a trivial special member of
/// the given kind.
void Sema::DiagnoseNontrivial(const CXXRecordDecl *RD, CXXSpecialMember CSM) {
QualType Ty = Context.getRecordType(RD);
bool ConstArg = (CSM == CXXCopyConstructor || CSM == CXXCopyAssignment);
checkTrivialSubobjectCall(*this, RD->getLocation(), Ty, ConstArg, CSM,
TSK_CompleteObject, TAH_IgnoreTrivialABI,
/*Diagnose*/true);
}
/// Determine whether a defaulted or deleted special member function is trivial,
/// as specified in C++11 [class.ctor]p5, C++11 [class.copy]p12,
/// C++11 [class.copy]p25, and C++11 [class.dtor]p5.
bool Sema::SpecialMemberIsTrivial(CXXMethodDecl *MD, CXXSpecialMember CSM,
TrivialABIHandling TAH, bool Diagnose) {
assert(!MD->isUserProvided() && CSM != CXXInvalid && "not special enough");
CXXRecordDecl *RD = MD->getParent();
bool ConstArg = false;
// C++11 [class.copy]p12, p25: [DR1593]
// A [special member] is trivial if [...] its parameter-type-list is
// equivalent to the parameter-type-list of an implicit declaration [...]
switch (CSM) {
case CXXDefaultConstructor:
case CXXDestructor:
// Trivial default constructors and destructors cannot have parameters.
break;
case CXXCopyConstructor:
case CXXCopyAssignment: {
// Trivial copy operations always have const, non-volatile parameter types.
ConstArg = true;
const ParmVarDecl *Param0 = MD->getParamDecl(0);
const ReferenceType *RT = Param0->getType()->getAs<ReferenceType>();
if (!RT || RT->getPointeeType().getCVRQualifiers() != Qualifiers::Const) {
if (Diagnose)
Diag(Param0->getLocation(), diag::note_nontrivial_param_type)
<< Param0->getSourceRange() << Param0->getType()
<< Context.getLValueReferenceType(
Context.getRecordType(RD).withConst());
return false;
}
break;
}
case CXXMoveConstructor:
case CXXMoveAssignment: {
// Trivial move operations always have non-cv-qualified parameters.
const ParmVarDecl *Param0 = MD->getParamDecl(0);
const RValueReferenceType *RT =
Param0->getType()->getAs<RValueReferenceType>();
if (!RT || RT->getPointeeType().getCVRQualifiers()) {
if (Diagnose)
Diag(Param0->getLocation(), diag::note_nontrivial_param_type)
<< Param0->getSourceRange() << Param0->getType()
<< Context.getRValueReferenceType(Context.getRecordType(RD));
return false;
}
break;
}
case CXXInvalid:
llvm_unreachable("not a special member");
}
if (MD->getMinRequiredArguments() < MD->getNumParams()) {
if (Diagnose)
Diag(MD->getParamDecl(MD->getMinRequiredArguments())->getLocation(),
diag::note_nontrivial_default_arg)
<< MD->getParamDecl(MD->getMinRequiredArguments())->getSourceRange();
return false;
}
if (MD->isVariadic()) {
if (Diagnose)
Diag(MD->getLocation(), diag::note_nontrivial_variadic);
return false;
}
// C++11 [class.copy]p12, C++11 [class.copy]p25:
// A copy/move [constructor or assignment operator] is trivial if
// -- the [member] selected to copy/move each direct base class subobject
// is trivial
//
// C++11 [class.ctor]p5, C++11 [class.dtor]p5:
// A [default constructor or destructor] is trivial if
// -- all the direct base classes have trivial [default constructors or
// destructors]
for (const auto &BI : RD->bases())
if (!checkTrivialSubobjectCall(*this, BI.getBeginLoc(), BI.getType(),
ConstArg, CSM, TSK_BaseClass, TAH, Diagnose))
return false;
// C++11 [class.copy]p12, C++11 [class.copy]p25:
// A copy/move [constructor or assignment operator] for a class X is
// trivial if
// -- for each non-static data member of X that is of class type (or array
// thereof), the constructor selected to copy/move that member is
// trivial
//
// C++11 [class.ctor]p5, C++11 [class.dtor]p5:
// A [default constructor or destructor] is trivial if
// -- for all of the non-static data members of its class that are of class
// type (or array thereof), each such class has a trivial [default
// constructor or destructor]
if (!checkTrivialClassMembers(*this, RD, CSM, ConstArg, TAH, Diagnose))
return false;
// C++11 [class.dtor]p5:
// A destructor is trivial if [...]
// -- the destructor is not virtual
if (CSM == CXXDestructor && MD->isVirtual()) {
if (Diagnose)
Diag(MD->getLocation(), diag::note_nontrivial_virtual_dtor) << RD;
return false;
}
// C++11 [class.ctor]p5, C++11 [class.copy]p12, C++11 [class.copy]p25:
// A [special member] for class X is trivial if [...]
// -- class X has no virtual functions and no virtual base classes
if (CSM != CXXDestructor && MD->getParent()->isDynamicClass()) {
if (!Diagnose)
return false;
if (RD->getNumVBases()) {
// Check for virtual bases. We already know that the corresponding
// member in all bases is trivial, so vbases must all be direct.
CXXBaseSpecifier &BS = *RD->vbases_begin();
assert(BS.isVirtual());
Diag(BS.getBeginLoc(), diag::note_nontrivial_has_virtual) << RD << 1;
return false;
}
// Must have a virtual method.
for (const auto *MI : RD->methods()) {
if (MI->isVirtual()) {
SourceLocation MLoc = MI->getBeginLoc();
Diag(MLoc, diag::note_nontrivial_has_virtual) << RD << 0;
return false;
}
}
llvm_unreachable("dynamic class with no vbases and no virtual functions");
}
// Looks like it's trivial!
return true;
}
namespace {
struct FindHiddenVirtualMethod {
Sema *S;
CXXMethodDecl *Method;
llvm::SmallPtrSet<const CXXMethodDecl *, 8> OverridenAndUsingBaseMethods;
SmallVector<CXXMethodDecl *, 8> OverloadedMethods;
private:
/// Check whether any most-overridden method from MD is in Methods.
static bool CheckMostOverridenMethods(
const CXXMethodDecl *MD,
const llvm::SmallPtrSetImpl<const CXXMethodDecl *> &Methods) {
if (MD->size_overridden_methods() == 0)
return Methods.count(MD->getCanonicalDecl());
for (const CXXMethodDecl *O : MD->overridden_methods())
if (CheckMostOverridenMethods(O, Methods))
return true;
return false;
}
public:
/// Member lookup function that determines whether a given C++
/// method overloads virtual methods in a base class without overriding any,
/// to be used with CXXRecordDecl::lookupInBases().
bool operator()(const CXXBaseSpecifier *Specifier, CXXBasePath &Path) {
RecordDecl *BaseRecord =
Specifier->getType()->getAs<RecordType>()->getDecl();
DeclarationName Name = Method->getDeclName();
assert(Name.getNameKind() == DeclarationName::Identifier);
bool foundSameNameMethod = false;
SmallVector<CXXMethodDecl *, 8> overloadedMethods;
for (Path.Decls = BaseRecord->lookup(Name); !Path.Decls.empty();
Path.Decls = Path.Decls.slice(1)) {
NamedDecl *D = Path.Decls.front();
if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(D)) {
MD = MD->getCanonicalDecl();
foundSameNameMethod = true;
// Interested only in hidden virtual methods.
if (!MD->isVirtual())
continue;
// If the method we are checking overrides a method from its base
// don't warn about the other overloaded methods. Clang deviates from
// GCC by only diagnosing overloads of inherited virtual functions that
// do not override any other virtual functions in the base. GCC's
// -Woverloaded-virtual diagnoses any derived function hiding a virtual
// function from a base class. These cases may be better served by a
// warning (not specific to virtual functions) on call sites when the
// call would select a different function from the base class, were it
// visible.
// See FIXME in test/SemaCXX/warn-overload-virtual.cpp for an example.
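// A hypothetical example (for exposition only):
//   struct Base { virtual void f(int); };
//   struct Derived : Base { void f(double); };  // hides Base::f(int)
// Derived::f overloads the inherited virtual Base::f without overriding it,
// so -Woverloaded-virtual fires on Derived::f.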
if (!S->IsOverload(Method, MD, false))
return true;
// Collect the overload only if it's hidden.
if (!CheckMostOverridenMethods(MD, OverridenAndUsingBaseMethods))
overloadedMethods.push_back(MD);
}
}
if (foundSameNameMethod)
OverloadedMethods.append(overloadedMethods.begin(),
overloadedMethods.end());
return foundSameNameMethod;
}
};
} // end anonymous namespace
/// Add the most-overridden methods from MD to Methods.
static void AddMostOverridenMethods(const CXXMethodDecl *MD,
llvm::SmallPtrSetImpl<const CXXMethodDecl *>& Methods) {
if (MD->size_overridden_methods() == 0)
Methods.insert(MD->getCanonicalDecl());
else
for (const CXXMethodDecl *O : MD->overridden_methods())
AddMostOverridenMethods(O, Methods);
}
/// Check if a method overloads virtual methods in a base class without
/// overriding any.
void Sema::FindHiddenVirtualMethods(CXXMethodDecl *MD,
SmallVectorImpl<CXXMethodDecl*> &OverloadedMethods) {
if (!MD->getDeclName().isIdentifier())
return;
CXXBasePaths Paths(/*FindAmbiguities=*/true, // true to look in all bases.
/*bool RecordPaths=*/false,
/*bool DetectVirtual=*/false);
FindHiddenVirtualMethod FHVM;
FHVM.Method = MD;
FHVM.S = this;
// Keep the base methods that were overridden or introduced in the subclass
// by 'using' in a set. A base method not in this set is hidden.
CXXRecordDecl *DC = MD->getParent();
DeclContext::lookup_result R = DC->lookup(MD->getDeclName());
for (DeclContext::lookup_iterator I = R.begin(), E = R.end(); I != E; ++I) {
NamedDecl *ND = *I;
if (UsingShadowDecl *shad = dyn_cast<UsingShadowDecl>(*I))
ND = shad->getTargetDecl();
if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(ND))
AddMostOverridenMethods(MD, FHVM.OverridenAndUsingBaseMethods);
}
if (DC->lookupInBases(FHVM, Paths))
OverloadedMethods = FHVM.OverloadedMethods;
}
void Sema::NoteHiddenVirtualMethods(CXXMethodDecl *MD,
SmallVectorImpl<CXXMethodDecl*> &OverloadedMethods) {
for (unsigned i = 0, e = OverloadedMethods.size(); i != e; ++i) {
CXXMethodDecl *overloadedMD = OverloadedMethods[i];
PartialDiagnostic PD = PDiag(
diag::note_hidden_overloaded_virtual_declared_here) << overloadedMD;
HandleFunctionTypeMismatch(PD, MD->getType(), overloadedMD->getType());
Diag(overloadedMD->getLocation(), PD);
}
}
/// Diagnose methods which overload virtual methods in a base class
/// without overriding any.
void Sema::DiagnoseHiddenVirtualMethods(CXXMethodDecl *MD) {
if (MD->isInvalidDecl())
return;
if (Diags.isIgnored(diag::warn_overloaded_virtual, MD->getLocation()))
return;
SmallVector<CXXMethodDecl *, 8> OverloadedMethods;
FindHiddenVirtualMethods(MD, OverloadedMethods);
if (!OverloadedMethods.empty()) {
Diag(MD->getLocation(), diag::warn_overloaded_virtual)
<< MD << (OverloadedMethods.size() > 1);
NoteHiddenVirtualMethods(MD, OverloadedMethods);
}
}
void Sema::checkIllFormedTrivialABIStruct(CXXRecordDecl &RD) {
auto PrintDiagAndRemoveAttr = [&]() {
// No diagnostics if this is a template instantiation.
if (!isTemplateInstantiation(RD.getTemplateSpecializationKind()))
Diag(RD.getAttr<TrivialABIAttr>()->getLocation(),
diag::ext_cannot_use_trivial_abi) << &RD;
RD.dropAttr<TrivialABIAttr>();
};
// Ill-formed if the struct has virtual functions.
if (RD.isPolymorphic()) {
PrintDiagAndRemoveAttr();
return;
}
for (const auto &B : RD.bases()) {
// Ill-formed if the base class is non-trivial for the purpose of calls or a
// virtual base.
if ((!B.getType()->isDependentType() &&
!B.getType()->getAsCXXRecordDecl()->canPassInRegisters()) ||
B.isVirtual()) {
PrintDiagAndRemoveAttr();
return;
}
}
for (const auto *FD : RD.fields()) {
// Ill-formed if the field is an ObjectiveC pointer or of a type that is
// non-trivial for the purpose of calls.
QualType FT = FD->getType();
if (FT.getObjCLifetime() == Qualifiers::OCL_Weak) {
PrintDiagAndRemoveAttr();
return;
}
if (const auto *RT = FT->getBaseElementTypeUnsafe()->getAs<RecordType>())
if (!RT->isDependentType() &&
!cast<CXXRecordDecl>(RT->getDecl())->canPassInRegisters()) {
PrintDiagAndRemoveAttr();
return;
}
}
}
void Sema::ActOnFinishCXXMemberSpecification(
Scope *S, SourceLocation RLoc, Decl *TagDecl, SourceLocation LBrac,
SourceLocation RBrac, const ParsedAttributesView &AttrList) {
if (!TagDecl)
return;
AdjustDeclIfTemplate(TagDecl);
for (const ParsedAttr &AL : AttrList) {
if (AL.getKind() != ParsedAttr::AT_Visibility)
continue;
AL.setInvalid();
Diag(AL.getLoc(), diag::warn_attribute_after_definition_ignored)
<< AL.getName();
}
ActOnFields(S, RLoc, TagDecl, llvm::makeArrayRef(
// strict aliasing violation!
reinterpret_cast<Decl**>(FieldCollector->getCurFields()),
FieldCollector->getCurNumFields()), LBrac, RBrac, AttrList);
CheckCompletedCXXClass(cast<CXXRecordDecl>(TagDecl));
}
/// AddImplicitlyDeclaredMembersToClass - Adds any implicitly-declared
/// special functions, such as the default constructor, copy
/// constructor, or destructor, to the given C++ class (C++
/// [special]p1). This routine can only be executed just before the
/// definition of the class is complete.
void Sema::AddImplicitlyDeclaredMembersToClass(CXXRecordDecl *ClassDecl) {
if (ClassDecl->needsImplicitDefaultConstructor()) {
++getASTContext().NumImplicitDefaultConstructors;
if (ClassDecl->hasInheritedConstructor())
DeclareImplicitDefaultConstructor(ClassDecl);
}
if (ClassDecl->needsImplicitCopyConstructor()) {
++getASTContext().NumImplicitCopyConstructors;
// If the properties or semantics of the copy constructor couldn't be
// determined while the class was being declared, force a declaration
// of it now.
if (ClassDecl->needsOverloadResolutionForCopyConstructor() ||
ClassDecl->hasInheritedConstructor())
DeclareImplicitCopyConstructor(ClassDecl);
// For the MS ABI we need to know whether the copy ctor is deleted. A
// prerequisite for deleting the implicit copy ctor is that the class has a
// move ctor or move assignment that is either user-declared or whose
// semantics are inherited from a subobject. FIXME: We should provide a more
// direct way for CodeGen to ask whether the constructor was deleted.
else if (Context.getTargetInfo().getCXXABI().isMicrosoft() &&
(ClassDecl->hasUserDeclaredMoveConstructor() ||
ClassDecl->needsOverloadResolutionForMoveConstructor() ||
ClassDecl->hasUserDeclaredMoveAssignment() ||
ClassDecl->needsOverloadResolutionForMoveAssignment()))
DeclareImplicitCopyConstructor(ClassDecl);
}
if (getLangOpts().CPlusPlus11 && ClassDecl->needsImplicitMoveConstructor()) {
++getASTContext().NumImplicitMoveConstructors;
if (ClassDecl->needsOverloadResolutionForMoveConstructor() ||
ClassDecl->hasInheritedConstructor())
DeclareImplicitMoveConstructor(ClassDecl);
}
if (ClassDecl->needsImplicitCopyAssignment()) {
++getASTContext().NumImplicitCopyAssignmentOperators;
// If we have a dynamic class, then the copy assignment operator may be
// virtual, so we have to declare it immediately. This ensures that, e.g.,
// it shows up in the right place in the vtable and that we diagnose
// problems with the implicit exception specification.
if (ClassDecl->isDynamicClass() ||
ClassDecl->needsOverloadResolutionForCopyAssignment() ||
ClassDecl->hasInheritedAssignment())
DeclareImplicitCopyAssignment(ClassDecl);
}
if (getLangOpts().CPlusPlus11 && ClassDecl->needsImplicitMoveAssignment()) {
++getASTContext().NumImplicitMoveAssignmentOperators;
// Likewise for the move assignment operator.
if (ClassDecl->isDynamicClass() ||
ClassDecl->needsOverloadResolutionForMoveAssignment() ||
ClassDecl->hasInheritedAssignment())
DeclareImplicitMoveAssignment(ClassDecl);
}
if (ClassDecl->needsImplicitDestructor()) {
++getASTContext().NumImplicitDestructors;
// If we have a dynamic class, then the destructor may be virtual, so we
// have to declare the destructor immediately. This ensures that, e.g., it
// shows up in the right place in the vtable and that we diagnose problems
// with the implicit exception specification.
if (ClassDecl->isDynamicClass() ||
ClassDecl->needsOverloadResolutionForDestructor())
DeclareImplicitDestructor(ClassDecl);
}
}
unsigned Sema::ActOnReenterTemplateScope(Scope *S, Decl *D) {
if (!D)
return 0;
// The order of template parameters is not important here. All names
// get added to the same scope.
SmallVector<TemplateParameterList *, 4> ParameterLists;
if (TemplateDecl *TD = dyn_cast<TemplateDecl>(D))
D = TD->getTemplatedDecl();
if (auto *PSD = dyn_cast<ClassTemplatePartialSpecializationDecl>(D))
ParameterLists.push_back(PSD->getTemplateParameters());
if (DeclaratorDecl *DD = dyn_cast<DeclaratorDecl>(D)) {
for (unsigned i = 0; i < DD->getNumTemplateParameterLists(); ++i)
ParameterLists.push_back(DD->getTemplateParameterList(i));
if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
if (FunctionTemplateDecl *FTD = FD->getDescribedFunctionTemplate())
ParameterLists.push_back(FTD->getTemplateParameters());
}
}
if (TagDecl *TD = dyn_cast<TagDecl>(D)) {
for (unsigned i = 0; i < TD->getNumTemplateParameterLists(); ++i)
ParameterLists.push_back(TD->getTemplateParameterList(i));
if (CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(TD)) {
if (ClassTemplateDecl *CTD = RD->getDescribedClassTemplate())
ParameterLists.push_back(CTD->getTemplateParameters());
}
}
unsigned Count = 0;
for (TemplateParameterList *Params : ParameterLists) {
if (Params->size() > 0)
// Ignore explicit specializations; they don't contribute to the template
// depth.
++Count;
for (NamedDecl *Param : *Params) {
if (Param->getDeclName()) {
S->AddDecl(Param);
IdResolver.AddDecl(Param);
}
}
}
return Count;
}
void Sema::ActOnStartDelayedMemberDeclarations(Scope *S, Decl *RecordD) {
if (!RecordD) return;
AdjustDeclIfTemplate(RecordD);
CXXRecordDecl *Record = cast<CXXRecordDecl>(RecordD);
PushDeclContext(S, Record);
}
void Sema::ActOnFinishDelayedMemberDeclarations(Scope *S, Decl *RecordD) {
if (!RecordD) return;
PopDeclContext();
}
/// This is used to implement the constant expression evaluation part of the
/// attribute enable_if extension. There is nothing in standard C++ which would
/// require reentering parameters.
void Sema::ActOnReenterCXXMethodParameter(Scope *S, ParmVarDecl *Param) {
if (!Param)
return;
S->AddDecl(Param);
if (Param->getDeclName())
IdResolver.AddDecl(Param);
}
/// ActOnStartDelayedCXXMethodDeclaration - We have completed
/// parsing a top-level (non-nested) C++ class, and we are now
/// parsing those parts of the given Method declaration that could
/// not be parsed earlier (C++ [class.mem]p2), such as default
/// arguments. This action should enter the scope of the given
/// Method declaration as if we had just parsed the qualified method
/// name. However, it should not bring the parameters into scope;
/// that will be performed by ActOnDelayedCXXMethodParameter.
void Sema::ActOnStartDelayedCXXMethodDeclaration(Scope *S, Decl *MethodD) {
}
/// ActOnDelayedCXXMethodParameter - We've already started a delayed
/// C++ method declaration. We're (re-)introducing the given
/// function parameter into scope for use in parsing later parts of
/// the method declaration. For example, we could see an
/// ActOnParamDefaultArgument event for this parameter.
void Sema::ActOnDelayedCXXMethodParameter(Scope *S, Decl *ParamD) {
if (!ParamD)
return;
ParmVarDecl *Param = cast<ParmVarDecl>(ParamD);
// If this parameter has an unparsed default argument, clear it out
// to make way for the parsed default argument.
if (Param->hasUnparsedDefaultArg())
Param->setDefaultArg(nullptr);
S->AddDecl(Param);
if (Param->getDeclName())
IdResolver.AddDecl(Param);
}
/// ActOnFinishDelayedCXXMethodDeclaration - We have finished
/// processing the delayed method declaration for Method. The method
/// declaration is now considered finished. There may be a separate
/// ActOnStartOfFunctionDef action later (not necessarily
/// immediately!) for this method, if it was also defined inside the
/// class body.
void Sema::ActOnFinishDelayedCXXMethodDeclaration(Scope *S, Decl *MethodD) {
if (!MethodD)
return;
AdjustDeclIfTemplate(MethodD);
FunctionDecl *Method = cast<FunctionDecl>(MethodD);
// Now that we have our default arguments, check the constructor
// again. It could produce additional diagnostics or affect whether
// the class has implicitly-declared destructors, among other
// things.
if (CXXConstructorDecl *Constructor = dyn_cast<CXXConstructorDecl>(Method))
CheckConstructor(Constructor);
// Check the default arguments, which we may have added.
if (!Method->isInvalidDecl())
CheckCXXDefaultArguments(Method);
}
// Emit the given diagnostic for each non-address-space qualifier.
// Common part of CheckConstructorDeclarator and CheckDestructorDeclarator.
static void checkMethodTypeQualifiers(Sema &S, Declarator &D, unsigned DiagID) {
const DeclaratorChunk::FunctionTypeInfo &FTI = D.getFunctionTypeInfo();
if (FTI.hasMethodTypeQualifiers() && !D.isInvalidType()) {
bool DiagOccured = false;
FTI.MethodQualifiers->forEachQualifier(
[DiagID, &S, &DiagOccured](DeclSpec::TQ, StringRef QualName,
SourceLocation SL) {
// This diagnostic should be emitted on any qualifier except an addr
// space qualifier. However, forEachQualifier currently doesn't visit
// addr space qualifiers, so there's no way to write this condition
// right now; we just diagnose on everything.
S.Diag(SL, DiagID) << QualName << SourceRange(SL);
DiagOccured = true;
});
if (DiagOccured)
D.setInvalidType();
}
}
/// CheckConstructorDeclarator - Called by ActOnDeclarator to check
/// the well-formedness of the constructor declarator @p D with type @p
/// R. If there are any errors in the declarator, this routine will
/// emit diagnostics and set the invalid bit to true. In any case, the type
/// will be updated to reflect a well-formed type for the constructor and
/// returned.
QualType Sema::CheckConstructorDeclarator(Declarator &D, QualType R,
StorageClass &SC) {
bool isVirtual = D.getDeclSpec().isVirtualSpecified();
// C++ [class.ctor]p3:
// A constructor shall not be virtual (10.3) or static (9.4). A
// constructor can be invoked for a const, volatile or const
// volatile object. A constructor shall not be declared const,
// volatile, or const volatile (9.3.2).
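// Hypothetical examples (for exposition only):
//   struct S { virtual S(); };  // error: constructor cannot be virtual
//   struct T { static T(); };   // error: constructor cannot be static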
if (isVirtual) {
if (!D.isInvalidType())
Diag(D.getIdentifierLoc(), diag::err_constructor_cannot_be)
<< "virtual" << SourceRange(D.getDeclSpec().getVirtualSpecLoc())
<< SourceRange(D.getIdentifierLoc());
D.setInvalidType();
}
if (SC == SC_Static) {
if (!D.isInvalidType())
Diag(D.getIdentifierLoc(), diag::err_constructor_cannot_be)
<< "static" << SourceRange(D.getDeclSpec().getStorageClassSpecLoc())
<< SourceRange(D.getIdentifierLoc());
D.setInvalidType();
SC = SC_None;
}
if (unsigned TypeQuals = D.getDeclSpec().getTypeQualifiers()) {
diagnoseIgnoredQualifiers(
diag::err_constructor_return_type, TypeQuals, SourceLocation(),
D.getDeclSpec().getConstSpecLoc(), D.getDeclSpec().getVolatileSpecLoc(),
D.getDeclSpec().getRestrictSpecLoc(),
D.getDeclSpec().getAtomicSpecLoc());
D.setInvalidType();
}
checkMethodTypeQualifiers(*this, D, diag::err_invalid_qualified_constructor);
// C++0x [class.ctor]p4:
// A constructor shall not be declared with a ref-qualifier.
DeclaratorChunk::FunctionTypeInfo &FTI = D.getFunctionTypeInfo();
if (FTI.hasRefQualifier()) {
Diag(FTI.getRefQualifierLoc(), diag::err_ref_qualifier_constructor)
<< FTI.RefQualifierIsLValueRef
<< FixItHint::CreateRemoval(FTI.getRefQualifierLoc());
D.setInvalidType();
}
// Rebuild the function type "R" without any type qualifiers (in
// case any of the errors above fired) and with "void" as the
// return type, since constructors don't have return types.
const FunctionProtoType *Proto = R->getAs<FunctionProtoType>();
if (Proto->getReturnType() == Context.VoidTy && !D.isInvalidType())
return R;
FunctionProtoType::ExtProtoInfo EPI = Proto->getExtProtoInfo();
EPI.TypeQuals = Qualifiers();
EPI.RefQualifier = RQ_None;
return Context.getFunctionType(Context.VoidTy, Proto->getParamTypes(), EPI);
}
/// CheckConstructor - Checks a fully-formed constructor for
/// well-formedness, issuing any diagnostics required. The constructor is
/// marked invalid if the declarator is ill-formed.
void Sema::CheckConstructor(CXXConstructorDecl *Constructor) {
CXXRecordDecl *ClassDecl
= dyn_cast<CXXRecordDecl>(Constructor->getDeclContext());
if (!ClassDecl)
return Constructor->setInvalidDecl();
// C++ [class.copy]p3:
// A declaration of a constructor for a class X is ill-formed if
// its first parameter is of type (optionally cv-qualified) X and
// either there are no other parameters or else all other
// parameters have default arguments.
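//
// For example (illustrative):
//
//   struct X {
//     X(X);               // error: must pass its first argument by reference;
//     X(X, int = 0);      //        the fix-it suggests inserting 'const &'
//   };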
if (!Constructor->isInvalidDecl() &&
((Constructor->getNumParams() == 1) ||
(Constructor->getNumParams() > 1 &&
Constructor->getParamDecl(1)->hasDefaultArg())) &&
Constructor->getTemplateSpecializationKind()
!= TSK_ImplicitInstantiation) {
QualType ParamType = Constructor->getParamDecl(0)->getType();
QualType ClassTy = Context.getTagDeclType(ClassDecl);
if (Context.getCanonicalType(ParamType).getUnqualifiedType() == ClassTy) {
SourceLocation ParamLoc = Constructor->getParamDecl(0)->getLocation();
const char *ConstRef
= Constructor->getParamDecl(0)->getIdentifier() ? "const &"
: " const &";
Diag(ParamLoc, diag::err_constructor_byvalue_arg)
<< FixItHint::CreateInsertion(ParamLoc, ConstRef);
// FIXME: Rather than making the constructor invalid, we should endeavor
// to fix the type.
Constructor->setInvalidDecl();
}
}
}
/// CheckDestructor - Checks a fully-formed destructor definition for
/// well-formedness, issuing any diagnostics required. Returns true
/// on error.
bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) {
CXXRecordDecl *RD = Destructor->getParent();
if (!Destructor->getOperatorDelete() && Destructor->isVirtual()) {
SourceLocation Loc;
if (!Destructor->isImplicit())
Loc = Destructor->getLocation();
else
Loc = RD->getLocation();
// If we have a virtual destructor, look up the deallocation function
if (FunctionDecl *OperatorDelete =
FindDeallocationFunctionForDestructor(Loc, RD)) {
Expr *ThisArg = nullptr;
// If the notional 'delete this' expression requires a non-trivial
// conversion from 'this' to the type of a destroying operator delete's
// first parameter, perform that conversion now.
if (OperatorDelete->isDestroyingOperatorDelete()) {
QualType ParamType = OperatorDelete->getParamDecl(0)->getType();
if (!declaresSameEntity(ParamType->getAsCXXRecordDecl(), RD)) {
// C++ [class.dtor]p13:
// ... as if for the expression 'delete this' appearing in a
// non-virtual destructor of the destructor's class.
ContextRAII SwitchContext(*this, Destructor);
ExprResult This =
ActOnCXXThis(OperatorDelete->getParamDecl(0)->getLocation());
assert(!This.isInvalid() && "couldn't form 'this' expr in dtor?");
This = PerformImplicitConversion(This.get(), ParamType, AA_Passing);
if (This.isInvalid()) {
// FIXME: Register this as a context note so that it comes out
// in the right order.
Diag(Loc, diag::note_implicit_delete_this_in_destructor_here);
return true;
}
ThisArg = This.get();
}
}
DiagnoseUseOfDecl(OperatorDelete, Loc);
MarkFunctionReferenced(Loc, OperatorDelete);
Destructor->setOperatorDelete(OperatorDelete, ThisArg);
}
}
return false;
}
/// CheckDestructorDeclarator - Called by ActOnDeclarator to check
/// the well-formedness of the destructor declarator @p D with type @p
/// R. If there are any errors in the declarator, this routine will
/// emit diagnostics and set the declarator to invalid. Even if this happens,
/// the type will be updated to reflect a well-formed type for the destructor
/// and returned.
QualType Sema::CheckDestructorDeclarator(Declarator &D, QualType R,
StorageClass& SC) {
// C++ [class.dtor]p1:
// [...] A typedef-name that names a class is a class-name
// (7.1.3); however, a typedef-name that names a class shall not
// be used as the identifier in the declarator for a destructor
// declaration.
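//
// For example (illustrative), given
//
//   struct X { };
//   typedef X Y;
//
// spelling the destructor as '~Y' rather than '~X' is diagnosed here, and
// likewise for a type alias naming X.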
QualType DeclaratorType = GetTypeFromParser(D.getName().DestructorName);
if (const TypedefType *TT = DeclaratorType->getAs<TypedefType>())
Diag(D.getIdentifierLoc(), diag::err_destructor_typedef_name)
<< DeclaratorType << isa<TypeAliasDecl>(TT->getDecl());
else if (const TemplateSpecializationType *TST =
DeclaratorType->getAs<TemplateSpecializationType>())
if (TST->isTypeAlias())
Diag(D.getIdentifierLoc(), diag::err_destructor_typedef_name)
<< DeclaratorType << 1;
// C++ [class.dtor]p2:
// A destructor is used to destroy objects of its class type. A
// destructor takes no parameters, and no return type can be
// specified for it (not even void). The address of a destructor
// shall not be taken. A destructor shall not be static. A
// destructor can be invoked for a const, volatile or const
// volatile object. A destructor shall not be declared const,
// volatile or const volatile (9.3.2).
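//
// Illustrative examples (each line shows a separate error) of destructor
// declarations diagnosed below:
//
//   struct X {
//     static ~X();        // error: 'static' not allowed (fix-it removes it)
//     ~X(int);            // error: destructor cannot have parameters
//     ~X(...);            // error: destructor cannot be variadic
//   };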
if (SC == SC_Static) {
if (!D.isInvalidType())
Diag(D.getIdentifierLoc(), diag::err_destructor_cannot_be)
<< "static" << SourceRange(D.getDeclSpec().getStorageClassSpecLoc())
<< SourceRange(D.getIdentifierLoc())
<< FixItHint::CreateRemoval(D.getDeclSpec().getStorageClassSpecLoc());
SC = SC_None;
}
if (!D.isInvalidType()) {
// Destructors don't have return types, but the parser will
// happily parse something like:
//
// class X {
// float ~X();
// };
//
// The return type will be eliminated later.
if (D.getDeclSpec().hasTypeSpecifier())
Diag(D.getIdentifierLoc(), diag::err_destructor_return_type)
<< SourceRange(D.getDeclSpec().getTypeSpecTypeLoc())
<< SourceRange(D.getIdentifierLoc());
else if (unsigned TypeQuals = D.getDeclSpec().getTypeQualifiers()) {
diagnoseIgnoredQualifiers(diag::err_destructor_return_type, TypeQuals,
SourceLocation(),
D.getDeclSpec().getConstSpecLoc(),
D.getDeclSpec().getVolatileSpecLoc(),
D.getDeclSpec().getRestrictSpecLoc(),
D.getDeclSpec().getAtomicSpecLoc());
D.setInvalidType();
}
}
checkMethodTypeQualifiers(*this, D, diag::err_invalid_qualified_destructor);
// C++0x [class.dtor]p2:
// A destructor shall not be declared with a ref-qualifier.
DeclaratorChunk::FunctionTypeInfo &FTI = D.getFunctionTypeInfo();
if (FTI.hasRefQualifier()) {
Diag(FTI.getRefQualifierLoc(), diag::err_ref_qualifier_destructor)
<< FTI.RefQualifierIsLValueRef
<< FixItHint::CreateRemoval(FTI.getRefQualifierLoc());
D.setInvalidType();
}
// Make sure we don't have any parameters.
if (FTIHasNonVoidParameters(FTI)) {
Diag(D.getIdentifierLoc(), diag::err_destructor_with_params);
// Delete the parameters.
FTI.freeParams();
D.setInvalidType();
}
// Make sure the destructor isn't variadic.
if (FTI.isVariadic) {
Diag(D.getIdentifierLoc(), diag::err_destructor_variadic);
D.setInvalidType();
}
// Rebuild the function type "R" without any type qualifiers or
// parameters (in case any of the errors above fired) and with
// "void" as the return type, since destructors don't have return
// types.
if (!D.isInvalidType())
return R;
const FunctionProtoType *Proto = R->getAs<FunctionProtoType>();
FunctionProtoType::ExtProtoInfo EPI = Proto->getExtProtoInfo();
EPI.Variadic = false;
EPI.TypeQuals = Qualifiers();
EPI.RefQualifier = RQ_None;
return Context.getFunctionType(Context.VoidTy, None, EPI);
}
static void extendLeft(SourceRange &R, SourceRange Before) {
if (Before.isInvalid())
return;
R.setBegin(Before.getBegin());
if (R.getEnd().isInvalid())
R.setEnd(Before.getEnd());
}
static void extendRight(SourceRange &R, SourceRange After) {
if (After.isInvalid())
return;
if (R.getBegin().isInvalid())
R.setBegin(After.getBegin());
R.setEnd(After.getEnd());
}
/// CheckConversionDeclarator - Called by ActOnDeclarator to check the
/// well-formedness of the conversion function declarator @p D with
/// type @p R. If there are any errors in the declarator, this routine
/// will emit diagnostics and mark the declarator invalid. Either way,
/// the type @p R will be updated to reflect a well-formed type for the
/// conversion operator.
void Sema::CheckConversionDeclarator(Declarator &D, QualType &R,
StorageClass& SC) {
// C++ [class.conv.fct]p1:
// Neither parameter types nor return type can be specified. The
// type of a conversion function (8.3.5) is "function taking no
// parameter returning conversion-type-id."
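//
// For example (illustrative):
//
//   struct X {
//     operator int(float);   // error: cannot have any parameters
//     operator int(...);     // error: cannot be variadic
//   };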
if (SC == SC_Static) {
if (!D.isInvalidType())
Diag(D.getIdentifierLoc(), diag::err_conv_function_not_member)
<< SourceRange(D.getDeclSpec().getStorageClassSpecLoc())
<< D.getName().getSourceRange();
D.setInvalidType();
SC = SC_None;
}
TypeSourceInfo *ConvTSI = nullptr;
QualType ConvType =
GetTypeFromParser(D.getName().ConversionFunctionId, &ConvTSI);
const DeclSpec &DS = D.getDeclSpec();
if (DS.hasTypeSpecifier() && !D.isInvalidType()) {
// Conversion functions don't have return types, but the parser will
// happily parse something like:
//
// class X {
// float operator bool();
// };
//
// The return type will be changed later anyway.
Diag(D.getIdentifierLoc(), diag::err_conv_function_return_type)
<< SourceRange(DS.getTypeSpecTypeLoc())
<< SourceRange(D.getIdentifierLoc());
D.setInvalidType();
} else if (DS.getTypeQualifiers() && !D.isInvalidType()) {
// It's also plausible that the user writes type qualifiers in the wrong
// place, such as:
// struct S { const operator int(); };
// FIXME: we could provide a fixit to move the qualifiers onto the
// conversion type.
Diag(D.getIdentifierLoc(), diag::err_conv_function_with_complex_decl)
<< SourceRange(D.getIdentifierLoc()) << 0;
D.setInvalidType();
}
const FunctionProtoType *Proto = R->getAs<FunctionProtoType>();
// Make sure we don't have any parameters.
if (Proto->getNumParams() > 0) {
Diag(D.getIdentifierLoc(), diag::err_conv_function_with_params);
// Delete the parameters.
D.getFunctionTypeInfo().freeParams();
D.setInvalidType();
} else if (Proto->isVariadic()) {
Diag(D.getIdentifierLoc(), diag::err_conv_function_variadic);
D.setInvalidType();
}
// Diagnose "&operator bool()" and other such nonsense. This
// is actually a gcc extension which we don't support.
if (Proto->getReturnType() != ConvType) {
bool NeedsTypedef = false;
SourceRange Before, After;
// Walk the chunks and extract information on them for our diagnostic.
bool PastFunctionChunk = false;
for (auto &Chunk : D.type_objects()) {
switch (Chunk.Kind) {
case DeclaratorChunk::Function:
if (!PastFunctionChunk) {
if (Chunk.Fun.HasTrailingReturnType) {
TypeSourceInfo *TRT = nullptr;
GetTypeFromParser(Chunk.Fun.getTrailingReturnType(), &TRT);
if (TRT) extendRight(After, TRT->getTypeLoc().getSourceRange());
}
PastFunctionChunk = true;
break;
}
LLVM_FALLTHROUGH;
case DeclaratorChunk::Array:
NeedsTypedef = true;
extendRight(After, Chunk.getSourceRange());
break;
case DeclaratorChunk::Pointer:
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::Reference:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pipe:
extendLeft(Before, Chunk.getSourceRange());
break;
case DeclaratorChunk::Paren:
extendLeft(Before, Chunk.Loc);
extendRight(After, Chunk.EndLoc);
break;
}
}
SourceLocation Loc = Before.isValid() ? Before.getBegin() :
After.isValid() ? After.getBegin() :
D.getIdentifierLoc();
auto &&DB = Diag(Loc, diag::err_conv_function_with_complex_decl);
DB << Before << After;
if (!NeedsTypedef) {
DB << /*don't need a typedef*/0;
// If we can provide a correct fix-it hint, do so.
if (After.isInvalid() && ConvTSI) {
SourceLocation InsertLoc =
getLocForEndOfToken(ConvTSI->getTypeLoc().getEndLoc());
DB << FixItHint::CreateInsertion(InsertLoc, " ")
<< FixItHint::CreateInsertionFromRange(
InsertLoc, CharSourceRange::getTokenRange(Before))
<< FixItHint::CreateRemoval(Before);
}
} else if (!Proto->getReturnType()->isDependentType()) {
DB << /*typedef*/1 << Proto->getReturnType();
} else if (getLangOpts().CPlusPlus11) {
DB << /*alias template*/2 << Proto->getReturnType();
} else {
DB << /*might not be fixable*/3;
}
// Recover by incorporating the other type chunks into the result type.
// Note, this does *not* change the name of the function. This is compatible
// with the GCC extension:
// struct S { &operator int(); } s;
// int &r = s.operator int(); // ok in GCC
// S::operator int&() {} // error in GCC, function name is 'operator int'.
ConvType = Proto->getReturnType();
}
// C++ [class.conv.fct]p4:
// The conversion-type-id shall not represent a function type nor
// an array type.
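//
// For example (illustrative), given
//
//   typedef int Arr[4];
//   typedef int Fn();
//
// declaring 'operator Arr()' or 'operator Fn()' is diagnosed below, and the
// conversion type is recovered as the corresponding pointer type.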
if (ConvType->isArrayType()) {
Diag(D.getIdentifierLoc(), diag::err_conv_function_to_array);
ConvType = Context.getPointerType(ConvType);
D.setInvalidType();
} else if (ConvType->isFunctionType()) {
Diag(D.getIdentifierLoc(), diag::err_conv_function_to_function);
ConvType = Context.getPointerType(ConvType);
D.setInvalidType();
}
// Rebuild the function type "R" without any parameters (in case any
// of the errors above fired) and with the conversion type as the
// return type.
if (D.isInvalidType())
R = Context.getFunctionType(ConvType, None, Proto->getExtProtoInfo());
// C++0x explicit conversion operators.
if (DS.hasExplicitSpecifier() && !getLangOpts().CPlusPlus2a)
Diag(DS.getExplicitSpecLoc(),
getLangOpts().CPlusPlus11
? diag::warn_cxx98_compat_explicit_conversion_functions
: diag::ext_explicit_conversion_functions)
<< SourceRange(DS.getExplicitSpecRange());
}
/// ActOnConversionDeclarator - Called by ActOnDeclarator to complete
/// the declaration of the given C++ conversion function. This routine
/// is responsible for recording the conversion function in the C++
/// class, if possible.
Decl *Sema::ActOnConversionDeclarator(CXXConversionDecl *Conversion) {
assert(Conversion && "Expected to receive a conversion function declaration");
CXXRecordDecl *ClassDecl = cast<CXXRecordDecl>(Conversion->getDeclContext());
// Make sure we aren't redeclaring the conversion function.
QualType ConvType = Context.getCanonicalType(Conversion->getConversionType());
// C++ [class.conv.fct]p1:
// [...] A conversion function is never used to convert a
// (possibly cv-qualified) object to the (possibly cv-qualified)
// same object type (or a reference to it), to a (possibly
// cv-qualified) base class of that type (or a reference to it),
// or to (possibly cv-qualified) void.
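//
// Illustrative examples of conversion functions warned about below:
//
//   struct B { };
//   struct D : B {
//     operator D();       // warning: converts to its own type; never used
//     operator B&();      // warning: converts to a base class; never used
//     operator void();    // warning: converts to void; never used
//   };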
// FIXME: Suppress this warning if the conversion function ends up being a
// virtual function that overrides a virtual function in a base class.
QualType ClassType
= Context.getCanonicalType(Context.getTypeDeclType(ClassDecl));
if (const ReferenceType *ConvTypeRef = ConvType->getAs<ReferenceType>())
ConvType = ConvTypeRef->getPointeeType();
if (Conversion->getTemplateSpecializationKind() != TSK_Undeclared &&
Conversion->getTemplateSpecializationKind() != TSK_ExplicitSpecialization)
/* Suppress diagnostics for instantiations. */;
else if (ConvType->isRecordType()) {
ConvType = Context.getCanonicalType(ConvType).getUnqualifiedType();
if (ConvType == ClassType)
Diag(Conversion->getLocation(), diag::warn_conv_to_self_not_used)
<< ClassType;
else if (IsDerivedFrom(Conversion->getLocation(), ClassType, ConvType))
Diag(Conversion->getLocation(), diag::warn_conv_to_base_not_used)
<< ClassType << ConvType;
} else if (ConvType->isVoidType()) {
Diag(Conversion->getLocation(), diag::warn_conv_to_void_not_used)
<< ClassType << ConvType;
}
if (FunctionTemplateDecl *ConversionTemplate
= Conversion->getDescribedFunctionTemplate())
return ConversionTemplate;
return Conversion;
}
namespace {
/// Utility class to accumulate and print a diagnostic listing the invalid
/// specifier(s) on a declaration.
struct BadSpecifierDiagnoser {
BadSpecifierDiagnoser(Sema &S, SourceLocation Loc, unsigned DiagID)
: S(S), Diagnostic(S.Diag(Loc, DiagID)) {}
~BadSpecifierDiagnoser() {
Diagnostic << Specifiers;
}
template<typename T> void check(SourceLocation SpecLoc, T Spec) {
return check(SpecLoc, DeclSpec::getSpecifierName(Spec));
}
void check(SourceLocation SpecLoc, DeclSpec::TST Spec) {
return check(SpecLoc,
DeclSpec::getSpecifierName(Spec, S.getPrintingPolicy()));
}
void check(SourceLocation SpecLoc, const char *Spec) {
if (SpecLoc.isInvalid()) return;
Diagnostic << SourceRange(SpecLoc, SpecLoc);
if (!Specifiers.empty()) Specifiers += " ";
Specifiers += Spec;
}
Sema &S;
Sema::SemaDiagnosticBuilder Diagnostic;
std::string Specifiers;
};
}
/// Check the validity of a declarator that we parsed for a deduction-guide.
/// These aren't actually declarators in the grammar, so we need to check that
/// the user didn't specify any pieces that are not part of the deduction-guide
/// grammar.
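///
/// For example (illustrative), for
///
///   template<typename T> struct S { S(T); };
///
/// a well-formed deduction-guide is
///
///   S(int) -> S<int>;
///
/// whereas a guide declared in the wrong scope, written with extra
/// specifiers, or whose trailing return type is not a specialization of S
/// is diagnosed below.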
void Sema::CheckDeductionGuideDeclarator(Declarator &D, QualType &R,
StorageClass &SC) {
TemplateName GuidedTemplate = D.getName().TemplateName.get().get();
TemplateDecl *GuidedTemplateDecl = GuidedTemplate.getAsTemplateDecl();
assert(GuidedTemplateDecl && "missing template decl for deduction guide");
// C++ [temp.deduct.guide]p3:
// A deduction-guide shall be declared in the same scope as the
// corresponding class template.
if (!CurContext->getRedeclContext()->Equals(
GuidedTemplateDecl->getDeclContext()->getRedeclContext())) {
Diag(D.getIdentifierLoc(), diag::err_deduction_guide_wrong_scope)
<< GuidedTemplateDecl;
Diag(GuidedTemplateDecl->getLocation(), diag::note_template_decl_here);
}
auto &DS = D.getMutableDeclSpec();
// We leave 'friend' and 'virtual' to be rejected in the normal way.
if (DS.hasTypeSpecifier() || DS.getTypeQualifiers() ||
DS.getStorageClassSpecLoc().isValid() || DS.isInlineSpecified() ||
DS.isNoreturnSpecified() || DS.hasConstexprSpecifier()) {
BadSpecifierDiagnoser Diagnoser(
*this, D.getIdentifierLoc(),
diag::err_deduction_guide_invalid_specifier);
Diagnoser.check(DS.getStorageClassSpecLoc(), DS.getStorageClassSpec());
DS.ClearStorageClassSpecs();
SC = SC_None;
// 'explicit' is permitted.
Diagnoser.check(DS.getInlineSpecLoc(), "inline");
Diagnoser.check(DS.getNoreturnSpecLoc(), "_Noreturn");
Diagnoser.check(DS.getConstexprSpecLoc(), "constexpr");
DS.ClearConstexprSpec();
Diagnoser.check(DS.getConstSpecLoc(), "const");
Diagnoser.check(DS.getRestrictSpecLoc(), "__restrict");
Diagnoser.check(DS.getVolatileSpecLoc(), "volatile");
Diagnoser.check(DS.getAtomicSpecLoc(), "_Atomic");
Diagnoser.check(DS.getUnalignedSpecLoc(), "__unaligned");
DS.ClearTypeQualifiers();
Diagnoser.check(DS.getTypeSpecComplexLoc(), DS.getTypeSpecComplex());
Diagnoser.check(DS.getTypeSpecSignLoc(), DS.getTypeSpecSign());
Diagnoser.check(DS.getTypeSpecWidthLoc(), DS.getTypeSpecWidth());
Diagnoser.check(DS.getTypeSpecTypeLoc(), DS.getTypeSpecType());
DS.ClearTypeSpecType();
}
if (D.isInvalidType())
return;
// Check the declarator is simple enough.
bool FoundFunction = false;
for (const DeclaratorChunk &Chunk : llvm::reverse(D.type_objects())) {
if (Chunk.Kind == DeclaratorChunk::Paren)
continue;
if (Chunk.Kind != DeclaratorChunk::Function || FoundFunction) {
Diag(D.getDeclSpec().getBeginLoc(),
diag::err_deduction_guide_with_complex_decl)
<< D.getSourceRange();
break;
}
if (!Chunk.Fun.hasTrailingReturnType()) {
Diag(D.getName().getBeginLoc(),
diag::err_deduction_guide_no_trailing_return_type);
break;
}
// Check that the return type is written as a specialization of
// the template specified as the deduction-guide's name.
ParsedType TrailingReturnType = Chunk.Fun.getTrailingReturnType();
TypeSourceInfo *TSI = nullptr;
QualType RetTy = GetTypeFromParser(TrailingReturnType, &TSI);
assert(TSI && "deduction guide has valid type but invalid return type?");
bool AcceptableReturnType = false;
bool MightInstantiateToSpecialization = false;
if (auto RetTST =
TSI->getTypeLoc().getAs<TemplateSpecializationTypeLoc>()) {
TemplateName SpecifiedName = RetTST.getTypePtr()->getTemplateName();
bool TemplateMatches =
Context.hasSameTemplateName(SpecifiedName, GuidedTemplate);
if (SpecifiedName.getKind() == TemplateName::Template && TemplateMatches)
AcceptableReturnType = true;
else {
// This could still instantiate to the right type, unless we know it
// names the wrong class template.
auto *TD = SpecifiedName.getAsTemplateDecl();
MightInstantiateToSpecialization = !(TD && isa<ClassTemplateDecl>(TD) &&
!TemplateMatches);
}
} else if (!RetTy.hasQualifiers() && RetTy->isDependentType()) {
MightInstantiateToSpecialization = true;
}
if (!AcceptableReturnType) {
Diag(TSI->getTypeLoc().getBeginLoc(),
diag::err_deduction_guide_bad_trailing_return_type)
<< GuidedTemplate << TSI->getType()
<< MightInstantiateToSpecialization
<< TSI->getTypeLoc().getSourceRange();
}
// Keep going to check that we don't have any inner declarator pieces (we
// could still have a function returning a pointer to a function).
FoundFunction = true;
}
if (D.isFunctionDefinition())
Diag(D.getIdentifierLoc(), diag::err_deduction_guide_defines_function);
}
//===----------------------------------------------------------------------===//
// Namespace Handling
//===----------------------------------------------------------------------===//
/// Diagnose a mismatch in 'inline' qualifiers when a namespace is
/// reopened.
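///
/// For example (illustrative):
///
///   inline namespace N { }
///   namespace N { }          // warning: inline namespace reopened as a
///                            // non-inline namespace (fix-it adds 'inline')
///
/// Reopening a non-inline namespace as inline is an error instead.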
static void DiagnoseNamespaceInlineMismatch(Sema &S, SourceLocation KeywordLoc,
SourceLocation Loc,
IdentifierInfo *II, bool *IsInline,
NamespaceDecl *PrevNS) {
assert(*IsInline != PrevNS->isInline());
// HACK: Work around a bug in libstdc++4.6's <atomic>, where
// std::__atomic[0,1,2] are defined as non-inline namespaces, then reopened as
// inline namespaces, with the intention of bringing names into namespace std.
//
// We support this just well enough to get that case working; this is not
// sufficient to support reopening namespaces as inline in general.
if (*IsInline && II && II->getName().startswith("__atomic") &&
S.getSourceManager().isInSystemHeader(Loc)) {
// Mark all prior declarations of the namespace as inline.
for (NamespaceDecl *NS = PrevNS->getMostRecentDecl(); NS;
NS = NS->getPreviousDecl())
NS->setInline(*IsInline);
// Patch up the lookup table for the containing namespace. This isn't really
// correct, but it's good enough for this particular case.
for (auto *I : PrevNS->decls())
if (auto *ND = dyn_cast<NamedDecl>(I))
PrevNS->getParent()->makeDeclVisibleInContext(ND);
return;
}
if (PrevNS->isInline())
// The user probably just forgot the 'inline', so suggest that it
// be added back.
S.Diag(Loc, diag::warn_inline_namespace_reopened_noninline)
<< FixItHint::CreateInsertion(KeywordLoc, "inline ");
else
S.Diag(Loc, diag::err_inline_namespace_mismatch);
S.Diag(PrevNS->getLocation(), diag::note_previous_definition);
*IsInline = PrevNS->isInline();
}
/// ActOnStartNamespaceDef - This is called at the start of a namespace
/// definition.
Decl *Sema::ActOnStartNamespaceDef(
Scope *NamespcScope, SourceLocation InlineLoc, SourceLocation NamespaceLoc,
SourceLocation IdentLoc, IdentifierInfo *II, SourceLocation LBrace,
const ParsedAttributesView &AttrList, UsingDirectiveDecl *&UD) {
SourceLocation StartLoc = InlineLoc.isValid() ? InlineLoc : NamespaceLoc;
// For anonymous namespace, take the location of the left brace.
SourceLocation Loc = II ? IdentLoc : LBrace;
bool IsInline = InlineLoc.isValid();
bool IsInvalid = false;
bool IsStd = false;
bool AddToKnown = false;
Scope *DeclRegionScope = NamespcScope->getParent();
NamespaceDecl *PrevNS = nullptr;
if (II) {
// C++ [namespace.def]p2:
// The identifier in an original-namespace-definition shall not
// have been previously defined in the declarative region in
// which the original-namespace-definition appears. The
// identifier in an original-namespace-definition is the name of
// the namespace. Subsequently in that declarative region, it is
// treated as an original-namespace-name.
//
// Since namespace names are unique in their scope, and we don't
// look through using directives, just look for any ordinary names
// as if by qualified name lookup.
LookupResult R(*this, II, IdentLoc, LookupOrdinaryName,
ForExternalRedeclaration);
LookupQualifiedName(R, CurContext->getRedeclContext());
NamedDecl *PrevDecl =
R.isSingleResult() ? R.getRepresentativeDecl() : nullptr;
PrevNS = dyn_cast_or_null<NamespaceDecl>(PrevDecl);
if (PrevNS) {
// This is an extended namespace definition.
if (IsInline != PrevNS->isInline())
DiagnoseNamespaceInlineMismatch(*this, NamespaceLoc, Loc, II,
&IsInline, PrevNS);
} else if (PrevDecl) {
// This is an invalid name redefinition.
Diag(Loc, diag::err_redefinition_different_kind)
<< II;
Diag(PrevDecl->getLocation(), diag::note_previous_definition);
IsInvalid = true;
// Continue on to push Namespc as current DeclContext and return it.
} else if (II->isStr("std") &&
CurContext->getRedeclContext()->isTranslationUnit()) {
// This is the first "real" definition of the namespace "std", so update
// our cache of the "std" namespace to point at this definition.
PrevNS = getStdNamespace();
IsStd = true;
AddToKnown = !IsInline;
} else {
// We've seen this namespace for the first time.
AddToKnown = !IsInline;
}
} else {
// Anonymous namespaces.
// Determine whether the parent already has an anonymous namespace.
DeclContext *Parent = CurContext->getRedeclContext();
if (TranslationUnitDecl *TU = dyn_cast<TranslationUnitDecl>(Parent)) {
PrevNS = TU->getAnonymousNamespace();
} else {
NamespaceDecl *ND = cast<NamespaceDecl>(Parent);
PrevNS = ND->getAnonymousNamespace();
}
if (PrevNS && IsInline != PrevNS->isInline())
DiagnoseNamespaceInlineMismatch(*this, NamespaceLoc, NamespaceLoc, II,
&IsInline, PrevNS);
}
NamespaceDecl *Namespc = NamespaceDecl::Create(Context, CurContext, IsInline,
StartLoc, Loc, II, PrevNS);
if (IsInvalid)
Namespc->setInvalidDecl();
ProcessDeclAttributeList(DeclRegionScope, Namespc, AttrList);
AddPragmaAttributes(DeclRegionScope, Namespc);
// FIXME: Should we be merging attributes?
if (const VisibilityAttr *Attr = Namespc->getAttr<VisibilityAttr>())
PushNamespaceVisibilityAttr(Attr, Loc);
if (IsStd)
StdNamespace = Namespc;
if (AddToKnown)
KnownNamespaces[Namespc] = false;
if (II) {
PushOnScopeChains(Namespc, DeclRegionScope);
} else {
// Link the anonymous namespace into its parent.
DeclContext *Parent = CurContext->getRedeclContext();
if (TranslationUnitDecl *TU = dyn_cast<TranslationUnitDecl>(Parent)) {
TU->setAnonymousNamespace(Namespc);
} else {
cast<NamespaceDecl>(Parent)->setAnonymousNamespace(Namespc);
}
CurContext->addDecl(Namespc);
// C++ [namespace.unnamed]p1. An unnamed-namespace-definition
// behaves as if it were replaced by
// namespace unique { /* empty body */ }
// using namespace unique;
// namespace unique { namespace-body }
// where all occurrences of 'unique' in a translation unit are
// replaced by the same identifier and this identifier differs
// from all other identifiers in the entire program.
// We just create the namespace with an empty name and then add an
// implicit using declaration, just like the standard suggests.
//
// CodeGen enforces the "universally unique" aspect by giving all
// declarations semantically contained within an anonymous
// namespace internal linkage.
if (!PrevNS) {
UD = UsingDirectiveDecl::Create(Context, Parent,
/* 'using' */ LBrace,
/* 'namespace' */ SourceLocation(),
/* qualifier */ NestedNameSpecifierLoc(),
/* identifier */ SourceLocation(),
Namespc,
/* Ancestor */ Parent);
UD->setImplicit();
Parent->addDecl(UD);
}
}
ActOnDocumentableDecl(Namespc);
// Although we could have an invalid decl (i.e. the namespace name is a
// redefinition), push it as current DeclContext and try to continue parsing.
// FIXME: We should be able to push Namespc here, so that each DeclContext
// for the namespace has the declarations that showed up in that particular
// namespace definition.
PushDeclContext(NamespcScope, Namespc);
return Namespc;
}
/// getNamespaceDecl - Returns the namespace a decl represents. If the decl
/// is a namespace alias, returns the namespace it points to.
static inline NamespaceDecl *getNamespaceDecl(NamedDecl *D) {
if (NamespaceAliasDecl *AD = dyn_cast_or_null<NamespaceAliasDecl>(D))
return AD->getNamespace();
return dyn_cast_or_null<NamespaceDecl>(D);
}
/// ActOnFinishNamespaceDef - This callback is called after a namespace is
/// exited. Decl is the DeclTy returned by ActOnStartNamespaceDef.
void Sema::ActOnFinishNamespaceDef(Decl *Dcl, SourceLocation RBrace) {
NamespaceDecl *Namespc = dyn_cast_or_null<NamespaceDecl>(Dcl);
assert(Namespc && "Invalid parameter, expected NamespaceDecl");
Namespc->setRBraceLoc(RBrace);
PopDeclContext();
if (Namespc->hasAttr<VisibilityAttr>())
PopPragmaVisibility(true, RBrace);
// If this namespace contains an export-declaration, export it now.
if (DeferredExportedNamespaces.erase(Namespc))
Dcl->setModuleOwnershipKind(Decl::ModuleOwnershipKind::VisibleWhenImported);
}
CXXRecordDecl *Sema::getStdBadAlloc() const {
return cast_or_null<CXXRecordDecl>(
StdBadAlloc.get(Context.getExternalSource()));
}
EnumDecl *Sema::getStdAlignValT() const {
return cast_or_null<EnumDecl>(StdAlignValT.get(Context.getExternalSource()));
}
NamespaceDecl *Sema::getStdNamespace() const {
return cast_or_null<NamespaceDecl>(
StdNamespace.get(Context.getExternalSource()));
}
NamespaceDecl *Sema::lookupStdExperimentalNamespace() {
if (!StdExperimentalNamespaceCache) {
if (auto Std = getStdNamespace()) {
LookupResult Result(*this, &PP.getIdentifierTable().get("experimental"),
SourceLocation(), LookupNamespaceName);
if (!LookupQualifiedName(Result, Std) ||
!(StdExperimentalNamespaceCache =
Result.getAsSingle<NamespaceDecl>()))
Result.suppressDiagnostics();
}
}
return StdExperimentalNamespaceCache;
}
namespace {
enum UnsupportedSTLSelect {
USS_InvalidMember,
USS_MissingMember,
USS_NonTrivial,
USS_Other
};
struct InvalidSTLDiagnoser {
Sema &S;
SourceLocation Loc;
QualType TyForDiags;
QualType operator()(UnsupportedSTLSelect Sel = USS_Other, StringRef Name = "",
const VarDecl *VD = nullptr) {
{
auto D = S.Diag(Loc, diag::err_std_compare_type_not_supported)
<< TyForDiags << ((int)Sel);
if (Sel == USS_InvalidMember || Sel == USS_MissingMember) {
assert(!Name.empty());
D << Name;
}
}
if (Sel == USS_InvalidMember) {
S.Diag(VD->getLocation(), diag::note_var_declared_here)
<< VD << VD->getSourceRange();
}
return QualType();
}
};
} // namespace
QualType Sema::CheckComparisonCategoryType(ComparisonCategoryType Kind,
SourceLocation Loc) {
assert(getLangOpts().CPlusPlus &&
"Looking for comparison category type outside of C++.");
// Check if we've already successfully checked the comparison category type
// before. If so, skip checking it again.
ComparisonCategoryInfo *Info = Context.CompCategories.lookupInfo(Kind);
if (Info && FullyCheckedComparisonCategories[static_cast<unsigned>(Kind)])
return Info->getType();
// If lookup failed
if (!Info) {
std::string NameForDiags = "std::";
NameForDiags += ComparisonCategories::getCategoryString(Kind);
Diag(Loc, diag::err_implied_comparison_category_type_not_found)
<< NameForDiags;
return QualType();
}
assert(Info->Kind == Kind);
assert(Info->Record);
// Update the Record decl in case we encountered a forward declaration on our
// first pass. FIXME: This is a bit of a hack.
if (Info->Record->hasDefinition())
Info->Record = Info->Record->getDefinition();
// Use an elaborated type for diagnostics which has a name containing the
// prepended 'std' namespace but not any inline namespace names.
QualType TyForDiags = [&]() {
auto *NNS =
NestedNameSpecifier::Create(Context, nullptr, getStdNamespace());
return Context.getElaboratedType(ETK_None, NNS, Info->getType());
}();
if (RequireCompleteType(Loc, TyForDiags, diag::err_incomplete_type))
return QualType();
InvalidSTLDiagnoser UnsupportedSTLError{*this, Loc, TyForDiags};
if (!Info->Record->isTriviallyCopyable())
return UnsupportedSTLError(USS_NonTrivial);
for (const CXXBaseSpecifier &BaseSpec : Info->Record->bases()) {
CXXRecordDecl *Base = BaseSpec.getType()->getAsCXXRecordDecl();
// Tolerate empty base classes.
if (Base->isEmpty())
continue;
// Reject STL implementations which have at least one non-empty base.
return UnsupportedSTLError();
}
// Check that the STL has implemented the types using a single integer field.
// This expectation allows better codegen for builtin operators. We require:
// (1) The class has exactly one field.
// (2) The field is an integral or enumeration type.
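// Illustrative sketch only (not any particular STL's definition): a layout
// such as
//
//   class strong_ordering { int Value; /* ...constant members... */ };
//
// i.e. exactly one non-static data member of integral or enumeration type,
// satisfies this check.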
auto FIt = Info->Record->field_begin(), FEnd = Info->Record->field_end();
if (std::distance(FIt, FEnd) != 1 ||
!FIt->getType()->isIntegralOrEnumerationType()) {
return UnsupportedSTLError();
}
// Build each of the required values and store them in Info.
for (ComparisonCategoryResult CCR :
ComparisonCategories::getPossibleResultsForType(Kind)) {
StringRef MemName = ComparisonCategories::getResultString(CCR);
ComparisonCategoryInfo::ValueInfo *ValInfo = Info->lookupValueInfo(CCR);
if (!ValInfo)
return UnsupportedSTLError(USS_MissingMember, MemName);
VarDecl *VD = ValInfo->VD;
assert(VD && "should not be null!");
// Attempt to diagnose reasons why the STL definition of this type
// might be foobar, including it failing to be a constant expression.
// TODO Handle more ways the lookup or result can be invalid.
if (!VD->isStaticDataMember() || !VD->isConstexpr() || !VD->hasInit() ||
!VD->checkInitIsICE())
return UnsupportedSTLError(USS_InvalidMember, MemName, VD);
// Attempt to evaluate the var decl as a constant expression and extract
// the value of its first field as an ICE. If this fails, the STL
// implementation is not supported.
if (!ValInfo->hasValidIntValue())
return UnsupportedSTLError();
MarkVariableReferenced(Loc, VD);
}
// We've successfully built the required types and expressions. Update
// the cache and return the newly cached value.
FullyCheckedComparisonCategories[static_cast<unsigned>(Kind)] = true;
return Info->getType();
}
/// Retrieve the special "std" namespace, which may require us to
/// implicitly define the namespace.
NamespaceDecl *Sema::getOrCreateStdNamespace() {
if (!StdNamespace) {
// The "std" namespace has not yet been defined, so build one implicitly.
StdNamespace = NamespaceDecl::Create(Context,
Context.getTranslationUnitDecl(),
/*Inline=*/false,
SourceLocation(), SourceLocation(),
&PP.getIdentifierTable().get("std"),
/*PrevDecl=*/nullptr);
getStdNamespace()->setImplicit(true);
}
return getStdNamespace();
}
bool Sema::isStdInitializerList(QualType Ty, QualType *Element) {
assert(getLangOpts().CPlusPlus &&
"Looking for std::initializer_list outside of C++.");
// We're looking for implicit instantiations of
// template <typename E> class std::initializer_list.
if (!StdNamespace) // If we haven't seen namespace std yet, this can't be it.
return false;
ClassTemplateDecl *Template = nullptr;
const TemplateArgument *Arguments = nullptr;
if (const RecordType *RT = Ty->getAs<RecordType>()) {
ClassTemplateSpecializationDecl *Specialization =
dyn_cast<ClassTemplateSpecializationDecl>(RT->getDecl());
if (!Specialization)
return false;
Template = Specialization->getSpecializedTemplate();
Arguments = Specialization->getTemplateArgs().data();
} else if (const TemplateSpecializationType *TST =
Ty->getAs<TemplateSpecializationType>()) {
Template = dyn_cast_or_null<ClassTemplateDecl>(
TST->getTemplateName().getAsTemplateDecl());
Arguments = TST->getArgs();
}
if (!Template)
return false;
if (!StdInitializerList) {
// Haven't recognized std::initializer_list yet, maybe this is it.
CXXRecordDecl *TemplateClass = Template->getTemplatedDecl();
if (TemplateClass->getIdentifier() !=
&PP.getIdentifierTable().get("initializer_list") ||
!getStdNamespace()->InEnclosingNamespaceSetOf(
TemplateClass->getDeclContext()))
return false;
// This is a template called std::initializer_list, but is it the right
// template?
TemplateParameterList *Params = Template->getTemplateParameters();
if (Params->getMinRequiredArguments() != 1)
return false;
if (!isa<TemplateTypeParmDecl>(Params->getParam(0)))
return false;
// It's the right template.
StdInitializerList = Template;
}
if (Template->getCanonicalDecl() != StdInitializerList->getCanonicalDecl())
return false;
// This is an instance of std::initializer_list. Find the argument type.
if (Element)
*Element = Arguments[0].getAsType();
return true;
}
static ClassTemplateDecl *LookupStdInitializerList(Sema &S, SourceLocation Loc){
NamespaceDecl *Std = S.getStdNamespace();
if (!Std) {
S.Diag(Loc, diag::err_implied_std_initializer_list_not_found);
return nullptr;
}
LookupResult Result(S, &S.PP.getIdentifierTable().get("initializer_list"),
Loc, Sema::LookupOrdinaryName);
if (!S.LookupQualifiedName(Result, Std)) {
S.Diag(Loc, diag::err_implied_std_initializer_list_not_found);
return nullptr;
}
ClassTemplateDecl *Template = Result.getAsSingle<ClassTemplateDecl>();
if (!Template) {
Result.suppressDiagnostics();
// We found something weird. Complain about the first thing we found.
NamedDecl *Found = *Result.begin();
S.Diag(Found->getLocation(), diag::err_malformed_std_initializer_list);
return nullptr;
}
// We found some template called std::initializer_list. Now verify that it's
// correct.
TemplateParameterList *Params = Template->getTemplateParameters();
if (Params->getMinRequiredArguments() != 1 ||
!isa<TemplateTypeParmDecl>(Params->getParam(0))) {
S.Diag(Template->getLocation(), diag::err_malformed_std_initializer_list);
return nullptr;
}
return Template;
}
QualType Sema::BuildStdInitializerList(QualType Element, SourceLocation Loc) {
if (!StdInitializerList) {
StdInitializerList = LookupStdInitializerList(*this, Loc);
if (!StdInitializerList)
return QualType();
}
TemplateArgumentListInfo Args(Loc, Loc);
Args.addArgument(TemplateArgumentLoc(TemplateArgument(Element),
Context.getTrivialTypeSourceInfo(Element,
Loc)));
return Context.getCanonicalType(
CheckTemplateIdType(TemplateName(StdInitializerList), Loc, Args));
}
bool Sema::isInitListConstructor(const FunctionDecl *Ctor) {
// C++ [dcl.init.list]p2:
// A constructor is an initializer-list constructor if its first parameter
// is of type std::initializer_list<E> or reference to possibly cv-qualified
// std::initializer_list<E> for some type E, and either there are no other
// parameters or else all other parameters have default arguments.
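//
// For example (illustrative):
//
//   struct X {
//     X(std::initializer_list<int>);                   // initializer-list ctor
//     X(const std::initializer_list<int> &, int = 0);  // initializer-list ctor
//     X(std::initializer_list<int>, int);              // not one: no default arg
//   };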
if (Ctor->getNumParams() < 1 ||
(Ctor->getNumParams() > 1 && !Ctor->getParamDecl(1)->hasDefaultArg()))
return false;
QualType ArgType = Ctor->getParamDecl(0)->getType();
if (const ReferenceType *RT = ArgType->getAs<ReferenceType>())
ArgType = RT->getPointeeType().getUnqualifiedType();
return isStdInitializerList(ArgType, nullptr);
}
/// Determine whether a using statement is in a context where it will
/// apply in all contexts.
static bool IsUsingDirectiveInToplevelContext(DeclContext *CurContext) {
switch (CurContext->getDeclKind()) {
case Decl::TranslationUnit:
return true;
case Decl::LinkageSpec:
return IsUsingDirectiveInToplevelContext(CurContext->getParent());
default:
return false;
}
}
namespace {
// Callback to only accept typo corrections that are namespaces.
class NamespaceValidatorCCC final : public CorrectionCandidateCallback {
public:
bool ValidateCandidate(const TypoCorrection &candidate) override {
if (NamedDecl *ND = candidate.getCorrectionDecl())
return isa<NamespaceDecl>(ND) || isa<NamespaceAliasDecl>(ND);
return false;
}
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return llvm::make_unique<NamespaceValidatorCCC>(*this);
}
};
}
static bool TryNamespaceTypoCorrection(Sema &S, LookupResult &R, Scope *Sc,
CXXScopeSpec &SS,
SourceLocation IdentLoc,
IdentifierInfo *Ident) {
R.clear();
NamespaceValidatorCCC CCC{};
if (TypoCorrection Corrected =
S.CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), Sc, &SS, CCC,
Sema::CTK_ErrorRecovery)) {
if (DeclContext *DC = S.computeDeclContext(SS, false)) {
std::string CorrectedStr(Corrected.getAsString(S.getLangOpts()));
bool DroppedSpecifier = Corrected.WillReplaceSpecifier() &&
Ident->getName().equals(CorrectedStr);
S.diagnoseTypo(Corrected,
S.PDiag(diag::err_using_directive_member_suggest)
<< Ident << DC << DroppedSpecifier << SS.getRange(),
S.PDiag(diag::note_namespace_defined_here));
} else {
S.diagnoseTypo(Corrected,
S.PDiag(diag::err_using_directive_suggest) << Ident,
S.PDiag(diag::note_namespace_defined_here));
}
R.addDecl(Corrected.getFoundDecl());
return true;
}
return false;
}
Decl *Sema::ActOnUsingDirective(Scope *S, SourceLocation UsingLoc,
SourceLocation NamespcLoc, CXXScopeSpec &SS,
SourceLocation IdentLoc,
IdentifierInfo *NamespcName,
const ParsedAttributesView &AttrList) {
assert(!SS.isInvalid() && "Invalid CXXScopeSpec.");
assert(NamespcName && "Invalid NamespcName.");
assert(IdentLoc.isValid() && "Invalid NamespcName location.");
// This can only happen along a recovery path.
while (S->isTemplateParamScope())
S = S->getParent();
assert(S->getFlags() & Scope::DeclScope && "Invalid Scope.");
UsingDirectiveDecl *UDir = nullptr;
NestedNameSpecifier *Qualifier = nullptr;
if (SS.isSet())
Qualifier = SS.getScopeRep();
// Lookup namespace name.
LookupResult R(*this, NamespcName, IdentLoc, LookupNamespaceName);
LookupParsedName(R, S, &SS);
if (R.isAmbiguous())
return nullptr;
if (R.empty()) {
R.clear();
// Allow "using namespace std;" or "using namespace ::std;" even if
// "std" hasn't been defined yet, for GCC compatibility.
if ((!Qualifier || Qualifier->getKind() == NestedNameSpecifier::Global) &&
NamespcName->isStr("std")) {
Diag(IdentLoc, diag::ext_using_undefined_std);
R.addDecl(getOrCreateStdNamespace());
R.resolveKind();
}
// Otherwise, attempt typo correction.
else TryNamespaceTypoCorrection(*this, R, S, SS, IdentLoc, NamespcName);
}
if (!R.empty()) {
NamedDecl *Named = R.getRepresentativeDecl();
NamespaceDecl *NS = R.getAsSingle<NamespaceDecl>();
assert(NS && "expected namespace decl");
// The use of a nested name specifier may trigger deprecation warnings.
DiagnoseUseOfDecl(Named, IdentLoc);
// C++ [namespace.udir]p1:
// A using-directive specifies that the names in the nominated
// namespace can be used in the scope in which the
// using-directive appears after the using-directive. During
// unqualified name lookup (3.4.1), the names appear as if they
// were declared in the nearest enclosing namespace which
// contains both the using-directive and the nominated
// namespace. [Note: in this context, "contains" means "contains
// directly or indirectly". ]
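//
// Illustrative example:
//
//   namespace A { namespace B { int x; } }
//   namespace A { void f() { using namespace B; x = 0; } }
//
// The nearest enclosing namespace containing both the using-directive and
// B is A, so unqualified lookup of 'x' inside f() finds B::x.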
// Find enclosing context containing both using-directive and
// nominated namespace.
DeclContext *CommonAncestor = NS;
while (CommonAncestor && !CommonAncestor->Encloses(CurContext))
CommonAncestor = CommonAncestor->getParent();
UDir = UsingDirectiveDecl::Create(Context, CurContext, UsingLoc, NamespcLoc,
SS.getWithLocInContext(Context),
IdentLoc, Named, CommonAncestor);
if (IsUsingDirectiveInToplevelContext(CurContext) &&
!SourceMgr.isInMainFile(SourceMgr.getExpansionLoc(IdentLoc))) {
Diag(IdentLoc, diag::warn_using_directive_in_header);
}
PushUsingDirective(S, UDir);
} else {
Diag(IdentLoc, diag::err_expected_namespace_name) << SS.getRange();
}
if (UDir)
ProcessDeclAttributeList(S, UDir, AttrList);
return UDir;
}
void Sema::PushUsingDirective(Scope *S, UsingDirectiveDecl *UDir) {
// If the scope has an associated entity and the using directive is at
// namespace or translation unit scope, add the UsingDirectiveDecl into
// its lookup structure so qualified name lookup can find it.
DeclContext *Ctx = S->getEntity();
if (Ctx && !Ctx->isFunctionOrMethod())
Ctx->addDecl(UDir);
else
// Otherwise, it is at block scope. The using-directives will affect lookup
// only to the end of the scope.
S->PushUsingDirective(UDir);
}
Decl *Sema::ActOnUsingDeclaration(Scope *S, AccessSpecifier AS,
SourceLocation UsingLoc,
SourceLocation TypenameLoc, CXXScopeSpec &SS,
UnqualifiedId &Name,
SourceLocation EllipsisLoc,
const ParsedAttributesView &AttrList) {
assert(S->getFlags() & Scope::DeclScope && "Invalid Scope.");
if (SS.isEmpty()) {
Diag(Name.getBeginLoc(), diag::err_using_requires_qualname);
return nullptr;
}
switch (Name.getKind()) {
case UnqualifiedIdKind::IK_ImplicitSelfParam:
case UnqualifiedIdKind::IK_Identifier:
case UnqualifiedIdKind::IK_OperatorFunctionId:
case UnqualifiedIdKind::IK_LiteralOperatorId:
case UnqualifiedIdKind::IK_ConversionFunctionId:
break;
case UnqualifiedIdKind::IK_ConstructorName:
case UnqualifiedIdKind::IK_ConstructorTemplateId:
// C++11 inheriting constructors.
Diag(Name.getBeginLoc(),
getLangOpts().CPlusPlus11
? diag::warn_cxx98_compat_using_decl_constructor
: diag::err_using_decl_constructor)
<< SS.getRange();
if (getLangOpts().CPlusPlus11) break;
return nullptr;
case UnqualifiedIdKind::IK_DestructorName:
Diag(Name.getBeginLoc(), diag::err_using_decl_destructor) << SS.getRange();
return nullptr;
case UnqualifiedIdKind::IK_TemplateId:
Diag(Name.getBeginLoc(), diag::err_using_decl_template_id)
<< SourceRange(Name.TemplateId->LAngleLoc, Name.TemplateId->RAngleLoc);
return nullptr;
case UnqualifiedIdKind::IK_DeductionGuideName:
llvm_unreachable("cannot parse qualified deduction guide name");
}
DeclarationNameInfo TargetNameInfo = GetNameFromUnqualifiedId(Name);
DeclarationName TargetName = TargetNameInfo.getName();
if (!TargetName)
return nullptr;
// Warn about access declarations.
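// An access-declaration is the pre-C++11 spelling that omits the 'using'
// keyword, e.g. (illustrative):
//
//   struct Derived : Base {
//     Base::member;        // deprecated in C++03, ill-formed in C++11;
//   };                     // the fix-it suggests 'using Base::member;'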
if (UsingLoc.isInvalid()) {
Diag(Name.getBeginLoc(), getLangOpts().CPlusPlus11
? diag::err_access_decl
: diag::warn_access_decl_deprecated)
<< FixItHint::CreateInsertion(SS.getRange().getBegin(), "using ");
}
if (EllipsisLoc.isInvalid()) {
if (DiagnoseUnexpandedParameterPack(SS, UPPC_UsingDeclaration) ||
DiagnoseUnexpandedParameterPack(TargetNameInfo, UPPC_UsingDeclaration))
return nullptr;
} else {
if (!SS.getScopeRep()->containsUnexpandedParameterPack() &&
!TargetNameInfo.containsUnexpandedParameterPack()) {
Diag(EllipsisLoc, diag::err_pack_expansion_without_parameter_packs)
<< SourceRange(SS.getBeginLoc(), TargetNameInfo.getEndLoc());
EllipsisLoc = SourceLocation();
}
}
NamedDecl *UD =
BuildUsingDeclaration(S, AS, UsingLoc, TypenameLoc.isValid(), TypenameLoc,
SS, TargetNameInfo, EllipsisLoc, AttrList,
/*IsInstantiation*/false);
if (UD)
PushOnScopeChains(UD, S, /*AddToContext*/ false);
return UD;
}
/// Determine whether a using declaration considers the given
/// declarations as "equivalent", e.g., if they are redeclarations of
/// the same entity or are both typedefs of the same type.
static bool
IsEquivalentForUsingDecl(ASTContext &Context, NamedDecl *D1, NamedDecl *D2) {
if (D1->getCanonicalDecl() == D2->getCanonicalDecl())
return true;
if (TypedefNameDecl *TD1 = dyn_cast<TypedefNameDecl>(D1))
if (TypedefNameDecl *TD2 = dyn_cast<TypedefNameDecl>(D2))
return Context.hasSameType(TD1->getUnderlyingType(),
TD2->getUnderlyingType());
return false;
}
/// Determines whether to create a using shadow decl for a particular
/// decl, given the set of decls existing prior to this using lookup.
bool Sema::CheckUsingShadowDecl(UsingDecl *Using, NamedDecl *Orig,
const LookupResult &Previous,
UsingShadowDecl *&PrevShadow) {
// Diagnose finding a decl which is not from a base class of the
// current class. We do this now because there are cases where this
// function will silently decide not to build a shadow decl, which
// will pre-empt further diagnostics.
//
// We don't need to do this in C++11 because we do the check once on
// the qualifier.
//
// FIXME: diagnose the following if we care enough:
// struct A { int foo; };
// struct B : A { using A::foo; };
// template <class T> struct C : A {};
// template <class T> struct D : C<T> { using B::foo; } // <---
// This is invalid (during instantiation) in C++03 because B::foo
// resolves to the using decl in B, which is not a base class of D<T>.
// We can't diagnose it immediately because C<T> is an unknown
// specialization. The UsingShadowDecl in D<T> then points directly
// to A::foo, which will look well-formed when we instantiate.
// The right solution is to not collapse the shadow-decl chain.
if (!getLangOpts().CPlusPlus11 && CurContext->isRecord()) {
DeclContext *OrigDC = Orig->getDeclContext();
// Handle enums and anonymous structs.
if (isa<EnumDecl>(OrigDC)) OrigDC = OrigDC->getParent();
CXXRecordDecl *OrigRec = cast<CXXRecordDecl>(OrigDC);
while (OrigRec->isAnonymousStructOrUnion())
OrigRec = cast<CXXRecordDecl>(OrigRec->getDeclContext());
if (cast<CXXRecordDecl>(CurContext)->isProvablyNotDerivedFrom(OrigRec)) {
if (OrigDC == CurContext) {
Diag(Using->getLocation(),
diag::err_using_decl_nested_name_specifier_is_current_class)
<< Using->getQualifierLoc().getSourceRange();
Diag(Orig->getLocation(), diag::note_using_decl_target);
Using->setInvalidDecl();
return true;
}
Diag(Using->getQualifierLoc().getBeginLoc(),
diag::err_using_decl_nested_name_specifier_is_not_base_class)
<< Using->getQualifier()
<< cast<CXXRecordDecl>(CurContext)
<< Using->getQualifierLoc().getSourceRange();
Diag(Orig->getLocation(), diag::note_using_decl_target);
Using->setInvalidDecl();
return true;
}
}
if (Previous.empty()) return false;
NamedDecl *Target = Orig;
if (isa<UsingShadowDecl>(Target))
Target = cast<UsingShadowDecl>(Target)->getTargetDecl();
// If the target happens to be one of the previous declarations, we
// don't have a conflict.
//
// FIXME: but we might be increasing its access, in which case we
// should redeclare it.
NamedDecl *NonTag = nullptr, *Tag = nullptr;
bool FoundEquivalentDecl = false;
for (LookupResult::iterator I = Previous.begin(), E = Previous.end();
I != E; ++I) {
NamedDecl *D = (*I)->getUnderlyingDecl();
// We can have UsingDecls in our Previous results because we use the same
// LookupResult for checking whether the UsingDecl itself is a valid
// redeclaration.
if (isa<UsingDecl>(D) || isa<UsingPackDecl>(D))
continue;
if (auto *RD = dyn_cast<CXXRecordDecl>(D)) {
// C++ [class.mem]p19:
// If T is the name of a class, then [every named member other than
// a non-static data member] shall have a name different from T
if (RD->isInjectedClassName() && !isa<FieldDecl>(Target) &&
!isa<IndirectFieldDecl>(Target) &&
!isa<UnresolvedUsingValueDecl>(Target) &&
DiagnoseClassNameShadow(
CurContext,
DeclarationNameInfo(Using->getDeclName(), Using->getLocation())))
return true;
}
if (IsEquivalentForUsingDecl(Context, D, Target)) {
if (UsingShadowDecl *Shadow = dyn_cast<UsingShadowDecl>(*I))
PrevShadow = Shadow;
FoundEquivalentDecl = true;
} else if (isEquivalentInternalLinkageDeclaration(D, Target)) {
// We don't conflict with an existing using shadow decl of an equivalent
// declaration, but we're not a redeclaration of it.
FoundEquivalentDecl = true;
}
if (isVisible(D))
(isa<TagDecl>(D) ? Tag : NonTag) = D;
}
if (FoundEquivalentDecl)
return false;
if (FunctionDecl *FD = Target->getAsFunction()) {
NamedDecl *OldDecl = nullptr;
switch (CheckOverload(nullptr, FD, Previous, OldDecl,
/*IsForUsingDecl*/ true)) {
case Ovl_Overload:
return false;
case Ovl_NonFunction:
Diag(Using->getLocation(), diag::err_using_decl_conflict);
break;
// We found a decl with the exact signature.
case Ovl_Match:
// If we're in a record, we want to hide the target, so we
// return true (without a diagnostic) to tell the caller not to
// build a shadow decl.
if (CurContext->isRecord())
return true;
// If we're not in a record, this is an error.
Diag(Using->getLocation(), diag::err_using_decl_conflict);
break;
}
Diag(Target->getLocation(), diag::note_using_decl_target);
Diag(OldDecl->getLocation(), diag::note_using_decl_conflict);
Using->setInvalidDecl();
return true;
}
// Target is not a function.
if (isa<TagDecl>(Target)) {
// No conflict between a tag and a non-tag.
if (!Tag) return false;
Diag(Using->getLocation(), diag::err_using_decl_conflict);
Diag(Target->getLocation(), diag::note_using_decl_target);
Diag(Tag->getLocation(), diag::note_using_decl_conflict);
Using->setInvalidDecl();
return true;
}
// No conflict between a tag and a non-tag.
if (!NonTag) return false;
Diag(Using->getLocation(), diag::err_using_decl_conflict);
Diag(Target->getLocation(), diag::note_using_decl_target);
Diag(NonTag->getLocation(), diag::note_using_decl_conflict);
Using->setInvalidDecl();
return true;
}
/// Determine whether a direct base class is a virtual base class.
static bool isVirtualDirectBase(CXXRecordDecl *Derived, CXXRecordDecl *Base) {
if (!Derived->getNumVBases())
return false;
for (auto &B : Derived->bases())
if (B.getType()->getAsCXXRecordDecl() == Base)
return B.isVirtual();
llvm_unreachable("not a direct base class");
}
/// Builds a shadow declaration corresponding to a 'using' declaration.
UsingShadowDecl *Sema::BuildUsingShadowDecl(Scope *S,
UsingDecl *UD,
NamedDecl *Orig,
UsingShadowDecl *PrevDecl) {
// If we resolved to another shadow declaration, just coalesce them.
NamedDecl *Target = Orig;
if (isa<UsingShadowDecl>(Target)) {
Target = cast<UsingShadowDecl>(Target)->getTargetDecl();
assert(!isa<UsingShadowDecl>(Target) && "nested shadow declaration");
}
NamedDecl *NonTemplateTarget = Target;
if (auto *TargetTD = dyn_cast<TemplateDecl>(Target))
NonTemplateTarget = TargetTD->getTemplatedDecl();
UsingShadowDecl *Shadow;
if (NonTemplateTarget && isa<CXXConstructorDecl>(NonTemplateTarget)) {
bool IsVirtualBase =
isVirtualDirectBase(cast<CXXRecordDecl>(CurContext),
UD->getQualifier()->getAsRecordDecl());
Shadow = ConstructorUsingShadowDecl::Create(
Context, CurContext, UD->getLocation(), UD, Orig, IsVirtualBase);
} else {
Shadow = UsingShadowDecl::Create(Context, CurContext, UD->getLocation(), UD,
Target);
}
UD->addShadowDecl(Shadow);
Shadow->setAccess(UD->getAccess());
if (Orig->isInvalidDecl() || UD->isInvalidDecl())
Shadow->setInvalidDecl();
Shadow->setPreviousDecl(PrevDecl);
if (S)
PushOnScopeChains(Shadow, S);
else
CurContext->addDecl(Shadow);
return Shadow;
}
/// Hides a using shadow declaration. This is required by the current
/// using-decl implementation when a resolvable using declaration in a
/// class is followed by a declaration which would hide or override
/// one or more of the using decl's targets; for example:
///
/// struct Base { void foo(int); };
/// struct Derived : Base {
/// using Base::foo;
/// void foo(int);
/// };
///
/// The governing language is C++03 [namespace.udecl]p12:
///
/// When a using-declaration brings names from a base class into a
/// derived class scope, member functions in the derived class
/// override and/or hide member functions with the same name and
/// parameter types in a base class (rather than conflicting).
///
/// There are two ways to implement this:
/// (1) optimistically create shadow decls when they're not hidden
/// by existing declarations, or
/// (2) don't create any shadow decls (or at least don't make them
/// visible) until we've fully parsed/instantiated the class.
/// The problem with (1) is that we might have to retroactively remove
/// a shadow decl, which requires several O(n) operations because the
/// decl structures are (very reasonably) not designed for removal.
/// (2) avoids this but is very fiddly and phase-dependent.
void Sema::HideUsingShadowDecl(Scope *S, UsingShadowDecl *Shadow) {
if (Shadow->getDeclName().getNameKind() ==
DeclarationName::CXXConversionFunctionName)
cast<CXXRecordDecl>(Shadow->getDeclContext())->removeConversion(Shadow);
// Remove it from the DeclContext...
Shadow->getDeclContext()->removeDecl(Shadow);
// ...and the scope, if applicable...
if (S) {
S->RemoveDecl(Shadow);
IdResolver.RemoveDecl(Shadow);
}
// ...and the using decl.
Shadow->getUsingDecl()->removeShadowDecl(Shadow);
// TODO: complain somehow if Shadow was used. It shouldn't
// be possible for this to happen, because...?
}
/// Find the base specifier for a base class with the given type.
static CXXBaseSpecifier *findDirectBaseWithType(CXXRecordDecl *Derived,
QualType DesiredBase,
bool &AnyDependentBases) {
// Check whether the named type is a direct base class.
CanQualType CanonicalDesiredBase = DesiredBase->getCanonicalTypeUnqualified();
for (auto &Base : Derived->bases()) {
CanQualType BaseType = Base.getType()->getCanonicalTypeUnqualified();
if (CanonicalDesiredBase == BaseType)
return &Base;
if (BaseType->isDependentType())
AnyDependentBases = true;
}
return nullptr;
}
namespace {
class UsingValidatorCCC final : public CorrectionCandidateCallback {
public:
UsingValidatorCCC(bool HasTypenameKeyword, bool IsInstantiation,
NestedNameSpecifier *NNS, CXXRecordDecl *RequireMemberOf)
: HasTypenameKeyword(HasTypenameKeyword),
IsInstantiation(IsInstantiation), OldNNS(NNS),
RequireMemberOf(RequireMemberOf) {}
bool ValidateCandidate(const TypoCorrection &Candidate) override {
NamedDecl *ND = Candidate.getCorrectionDecl();
// Keywords are not valid here.
if (!ND || isa<NamespaceDecl>(ND))
return false;
// Completely unqualified names are invalid for a 'using' declaration.
if (Candidate.WillReplaceSpecifier() && !Candidate.getCorrectionSpecifier())
return false;
// FIXME: Don't correct to a name that CheckUsingDeclRedeclaration would
// reject.
if (RequireMemberOf) {
auto *FoundRecord = dyn_cast<CXXRecordDecl>(ND);
if (FoundRecord && FoundRecord->isInjectedClassName()) {
// No-one ever wants a using-declaration to name an injected-class-name
// of a base class, unless they're declaring an inheriting constructor.
ASTContext &Ctx = ND->getASTContext();
if (!Ctx.getLangOpts().CPlusPlus11)
return false;
QualType FoundType = Ctx.getRecordType(FoundRecord);
// Check that the injected-class-name is named as a member of its own
// type; we don't want to suggest 'using Derived::Base;', since that
// means something else.
NestedNameSpecifier *Specifier =
Candidate.WillReplaceSpecifier()
? Candidate.getCorrectionSpecifier()
: OldNNS;
if (!Specifier->getAsType() ||
!Ctx.hasSameType(QualType(Specifier->getAsType(), 0), FoundType))
return false;
// Check that this inheriting constructor declaration actually names a
// direct base class of the current class.
bool AnyDependentBases = false;
if (!findDirectBaseWithType(RequireMemberOf,
Ctx.getRecordType(FoundRecord),
AnyDependentBases) &&
!AnyDependentBases)
return false;
} else {
auto *RD = dyn_cast<CXXRecordDecl>(ND->getDeclContext());
if (!RD || RequireMemberOf->isProvablyNotDerivedFrom(RD))
return false;
// FIXME: Check that the base class member is accessible?
}
} else {
auto *FoundRecord = dyn_cast<CXXRecordDecl>(ND);
if (FoundRecord && FoundRecord->isInjectedClassName())
return false;
}
if (isa<TypeDecl>(ND))
return HasTypenameKeyword || !IsInstantiation;
return !HasTypenameKeyword;
}
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return llvm::make_unique<UsingValidatorCCC>(*this);
}
private:
bool HasTypenameKeyword;
bool IsInstantiation;
NestedNameSpecifier *OldNNS;
CXXRecordDecl *RequireMemberOf;
};
} // end anonymous namespace
/// Builds a using declaration.
///
/// \param IsInstantiation - Whether this call arises from an
/// instantiation of an unresolved using declaration. We treat
/// the lookup differently for these declarations.
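///
/// Illustrative sketch of the two paths (not from the original comment):
///   struct A { void f(); };
///   struct B : A { using A::f; };   // resolved immediately
///   template<typename T>
///   struct C : T { using T::f; };   // dependent, so left unresolved;
///                                   // IsInstantiation is true when the
///                                   // declaration is rebuilt for C<A>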
NamedDecl *Sema::BuildUsingDeclaration(
Scope *S, AccessSpecifier AS, SourceLocation UsingLoc,
bool HasTypenameKeyword, SourceLocation TypenameLoc, CXXScopeSpec &SS,
DeclarationNameInfo NameInfo, SourceLocation EllipsisLoc,
const ParsedAttributesView &AttrList, bool IsInstantiation) {
assert(!SS.isInvalid() && "Invalid CXXScopeSpec.");
SourceLocation IdentLoc = NameInfo.getLoc();
assert(IdentLoc.isValid() && "Invalid TargetName location.");
// FIXME: We ignore attributes for now.
// For an inheriting constructor declaration, the name of the using
// declaration is the name of a constructor in this class, not in the
// base class.
DeclarationNameInfo UsingName = NameInfo;
if (UsingName.getName().getNameKind() == DeclarationName::CXXConstructorName)
if (auto *RD = dyn_cast<CXXRecordDecl>(CurContext))
UsingName.setName(Context.DeclarationNames.getCXXConstructorName(
Context.getCanonicalType(Context.getRecordType(RD))));
// Do the redeclaration lookup in the current scope.
LookupResult Previous(*this, UsingName, LookupUsingDeclName,
ForVisibleRedeclaration);
Previous.setHideTags(false);
if (S) {
LookupName(Previous, S);
// It is really dumb that we have to do this.
LookupResult::Filter F = Previous.makeFilter();
while (F.hasNext()) {
NamedDecl *D = F.next();
if (!isDeclInScope(D, CurContext, S))
F.erase();
// If we found a local extern declaration that's not ordinarily visible,
// and this declaration is being added to a non-block scope, ignore it.
// We're only checking for scope conflicts here, not also for violations
// of the linkage rules.
else if (!CurContext->isFunctionOrMethod() && D->isLocalExternDecl() &&
!(D->getIdentifierNamespace() & Decl::IDNS_Ordinary))
F.erase();
}
F.done();
} else {
assert(IsInstantiation && "no scope in non-instantiation");
if (CurContext->isRecord())
LookupQualifiedName(Previous, CurContext);
else {
// No redeclaration check is needed here; in non-member contexts we
// diagnosed all possible conflicts with other using-declarations when
// building the template:
//
// For a dependent non-type using declaration, the only valid case is
// if we instantiate to a single enumerator. We check for conflicts
// between shadow declarations we introduce, and we check in the template
// definition for conflicts between a non-type using declaration and any
// other declaration, which together covers all cases.
//
// A dependent typename using declaration will never successfully
// instantiate, since it will always name a class member, so we reject
// that in the template definition.
}
}
// Check for invalid redeclarations.
if (CheckUsingDeclRedeclaration(UsingLoc, HasTypenameKeyword,
SS, IdentLoc, Previous))
return nullptr;
// Check for bad qualifiers.
if (CheckUsingDeclQualifier(UsingLoc, HasTypenameKeyword, SS, NameInfo,
IdentLoc))
return nullptr;
DeclContext *LookupContext = computeDeclContext(SS);
NamedDecl *D;
NestedNameSpecifierLoc QualifierLoc = SS.getWithLocInContext(Context);
if (!LookupContext || EllipsisLoc.isValid()) {
if (HasTypenameKeyword) {
// FIXME: not all declaration name kinds are legal here
D = UnresolvedUsingTypenameDecl::Create(Context, CurContext,
UsingLoc, TypenameLoc,
QualifierLoc,
IdentLoc, NameInfo.getName(),
EllipsisLoc);
} else {
D = UnresolvedUsingValueDecl::Create(Context, CurContext, UsingLoc,
QualifierLoc, NameInfo, EllipsisLoc);
}
D->setAccess(AS);
CurContext->addDecl(D);
return D;
}
auto Build = [&](bool Invalid) {
UsingDecl *UD =
UsingDecl::Create(Context, CurContext, UsingLoc, QualifierLoc,
UsingName, HasTypenameKeyword);
UD->setAccess(AS);
CurContext->addDecl(UD);
UD->setInvalidDecl(Invalid);
return UD;
};
auto BuildInvalid = [&]{ return Build(true); };
auto BuildValid = [&]{ return Build(false); };
if (RequireCompleteDeclContext(SS, LookupContext))
return BuildInvalid();
// Look up the target name.
LookupResult R(*this, NameInfo, LookupOrdinaryName);
// Unlike most lookups, we don't always want to hide tag
// declarations: tag names are visible through the using declaration
// even if hidden by ordinary names, *except* in a dependent context
// where it's important for the sanity of two-phase lookup.
if (!IsInstantiation)
R.setHideTags(false);
// For the purposes of this lookup, we have a base object type
// equal to that of the current context.
if (CurContext->isRecord()) {
R.setBaseObjectType(
Context.getTypeDeclType(cast<CXXRecordDecl>(CurContext)));
}
LookupQualifiedName(R, LookupContext);
// Try to correct typos if possible. If constructor name lookup finds no
// results, that means the named class has no explicit constructors, and we
// suppressed declaring implicit ones (probably because it's dependent or
// invalid).
if (R.empty() &&
NameInfo.getName().getNameKind() != DeclarationName::CXXConstructorName) {
// HACK: Work around a bug in libstdc++'s detection of ::gets. Sometimes
// it will believe that glibc provides a ::gets in cases where it does not,
// and will try to pull it into namespace std with a using-declaration.
// Just ignore the using-declaration in that case.
auto *II = NameInfo.getName().getAsIdentifierInfo();
if (getLangOpts().CPlusPlus14 && II && II->isStr("gets") &&
CurContext->isStdNamespace() &&
isa<TranslationUnitDecl>(LookupContext) &&
getSourceManager().isInSystemHeader(UsingLoc))
return nullptr;
UsingValidatorCCC CCC(HasTypenameKeyword, IsInstantiation, SS.getScopeRep(),
dyn_cast<CXXRecordDecl>(CurContext));
if (TypoCorrection Corrected =
CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), S, &SS, CCC,
CTK_ErrorRecovery)) {
// We reject candidates where DroppedSpecifier == true, hence the
// literal '0' below.
diagnoseTypo(Corrected, PDiag(diag::err_no_member_suggest)
<< NameInfo.getName() << LookupContext << 0
<< SS.getRange());
// If we picked a correction with no attached Decl we can't do anything
// useful with it, bail out.
NamedDecl *ND = Corrected.getCorrectionDecl();
if (!ND)
return BuildInvalid();
// If we corrected to an inheriting constructor, handle it as one.
auto *RD = dyn_cast<CXXRecordDecl>(ND);
if (RD && RD->isInjectedClassName()) {
// The parent of the injected class name is the class itself.
RD = cast<CXXRecordDecl>(RD->getParent());
// Fix up the information we'll use to build the using declaration.
if (Corrected.WillReplaceSpecifier()) {
NestedNameSpecifierLocBuilder Builder;
Builder.MakeTrivial(Context, Corrected.getCorrectionSpecifier(),
QualifierLoc.getSourceRange());
QualifierLoc = Builder.getWithLocInContext(Context);
}
// In this case, the name we introduce is the name of a derived class
// constructor.
auto *CurClass = cast<CXXRecordDecl>(CurContext);
UsingName.setName(Context.DeclarationNames.getCXXConstructorName(
Context.getCanonicalType(Context.getRecordType(CurClass))));
UsingName.setNamedTypeInfo(nullptr);
for (auto *Ctor : LookupConstructors(RD))
R.addDecl(Ctor);
R.resolveKind();
} else {
// FIXME: Pick up all the declarations if we found an overloaded
// function.
UsingName.setName(ND->getDeclName());
R.addDecl(ND);
}
} else {
Diag(IdentLoc, diag::err_no_member)
<< NameInfo.getName() << LookupContext << SS.getRange();
return BuildInvalid();
}
}
if (R.isAmbiguous())
return BuildInvalid();
if (HasTypenameKeyword) {
// If we asked for a typename and got a non-type decl, error out.
if (!R.getAsSingle<TypeDecl>()) {
Diag(IdentLoc, diag::err_using_typename_non_type);
for (LookupResult::iterator I = R.begin(), E = R.end(); I != E; ++I)
Diag((*I)->getUnderlyingDecl()->getLocation(),
diag::note_using_decl_target);
return BuildInvalid();
}
} else {
// If we asked for a non-typename and we got a type, error out,
// but only if this is an instantiation of an unresolved using
// decl. Otherwise just silently find the type name.
if (IsInstantiation && R.getAsSingle<TypeDecl>()) {
Diag(IdentLoc, diag::err_using_dependent_value_is_type);
Diag(R.getFoundDecl()->getLocation(), diag::note_using_decl_target);
return BuildInvalid();
}
}
// C++14 [namespace.udecl]p6:
// A using-declaration shall not name a namespace.
if (R.getAsSingle<NamespaceDecl>()) {
Diag(IdentLoc, diag::err_using_decl_can_not_refer_to_namespace)
<< SS.getRange();
return BuildInvalid();
}
// C++14 [namespace.udecl]p7:
// A using-declaration shall not name a scoped enumerator.
if (auto *ED = R.getAsSingle<EnumConstantDecl>()) {
if (cast<EnumDecl>(ED->getDeclContext())->isScoped()) {
Diag(IdentLoc, diag::err_using_decl_can_not_refer_to_scoped_enum)
<< SS.getRange();
return BuildInvalid();
}
}
UsingDecl *UD = BuildValid();
// Some additional rules apply to inheriting constructors.
if (UsingName.getName().getNameKind() ==
DeclarationName::CXXConstructorName) {
// Suppress access diagnostics; the access check is instead performed at the
// point of use for an inheriting constructor.
R.suppressDiagnostics();
if (CheckInheritingConstructorUsingDecl(UD))
return UD;
}
for (LookupResult::iterator I = R.begin(), E = R.end(); I != E; ++I) {
UsingShadowDecl *PrevDecl = nullptr;
if (!CheckUsingShadowDecl(UD, *I, Previous, PrevDecl))
BuildUsingShadowDecl(S, UD, *I, PrevDecl);
}
return UD;
}
NamedDecl *Sema::BuildUsingPackDecl(NamedDecl *InstantiatedFrom,
ArrayRef<NamedDecl *> Expansions) {
assert(isa<UnresolvedUsingValueDecl>(InstantiatedFrom) ||
isa<UnresolvedUsingTypenameDecl>(InstantiatedFrom) ||
isa<UsingPackDecl>(InstantiatedFrom));
auto *UPD =
UsingPackDecl::Create(Context, CurContext, InstantiatedFrom, Expansions);
UPD->setAccess(InstantiatedFrom->getAccess());
CurContext->addDecl(UPD);
return UPD;
}
/// Additional checks for a using declaration referring to a constructor name.
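///
/// For example (illustrative): the nested-name-specifier must name a direct
/// base of the current class.
///   struct A { A(int); };
///   struct B : A {};
///   struct C : B {
///     using B::B;   // OK: B is a direct base of C
///     using A::A;   // error: A is not a direct base of C
///   };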
bool Sema::CheckInheritingConstructorUsingDecl(UsingDecl *UD) {
assert(!UD->hasTypename() && "expecting a constructor name");
const Type *SourceType = UD->getQualifier()->getAsType();
assert(SourceType &&
"Using decl naming constructor doesn't have type in scope spec.");
CXXRecordDecl *TargetClass = cast<CXXRecordDecl>(CurContext);
// Check whether the named type is a direct base class.
bool AnyDependentBases = false;
auto *Base = findDirectBaseWithType(TargetClass, QualType(SourceType, 0),
AnyDependentBases);
if (!Base && !AnyDependentBases) {
Diag(UD->getUsingLoc(),
diag::err_using_decl_constructor_not_in_direct_base)
<< UD->getNameInfo().getSourceRange()
<< QualType(SourceType, 0) << TargetClass;
UD->setInvalidDecl();
return true;
}
if (Base)
Base->setInheritConstructors();
return false;
}
/// Checks that the given using declaration is not an invalid
/// redeclaration. Note that this is checking only for the using decl
/// itself, not for any ill-formedness among the UsingShadowDecls.
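///
/// For example (illustrative): a using-declaration may be repeated only where
/// multiple declarations are allowed, i.e. at namespace scope but not inside
/// a class.
///   namespace N { void f(); }
///   using N::f;
///   using N::f;      // OK at namespace scope
///   struct A { void g(); };
///   struct B : A {
///     using A::g;
///     using A::g;    // error: redeclaration of a member
///   };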
bool Sema::CheckUsingDeclRedeclaration(SourceLocation UsingLoc,
bool HasTypenameKeyword,
const CXXScopeSpec &SS,
SourceLocation NameLoc,
const LookupResult &Prev) {
NestedNameSpecifier *Qual = SS.getScopeRep();
// C++03 [namespace.udecl]p8:
// C++0x [namespace.udecl]p10:
// A using-declaration is a declaration and can therefore be used
// repeatedly where (and only where) multiple declarations are
// allowed.
//
// That's in non-member contexts.
if (!CurContext->getRedeclContext()->isRecord()) {
// A dependent qualifier outside a class can only ever resolve to an
// enumeration type. Therefore it conflicts with any other non-type
// declaration in the same scope.
// FIXME: How should we check for dependent type-type conflicts at block
// scope?
if (Qual->isDependent() && !HasTypenameKeyword) {
for (auto *D : Prev) {
if (!isa<TypeDecl>(D) && !isa<UsingDecl>(D) && !isa<UsingPackDecl>(D)) {
bool OldCouldBeEnumerator =
isa<UnresolvedUsingValueDecl>(D) || isa<EnumConstantDecl>(D);
Diag(NameLoc,
OldCouldBeEnumerator ? diag::err_redefinition
: diag::err_redefinition_different_kind)
<< Prev.getLookupName();
Diag(D->getLocation(), diag::note_previous_definition);
return true;
}
}
}
return false;
}
for (LookupResult::iterator I = Prev.begin(), E = Prev.end(); I != E; ++I) {
NamedDecl *D = *I;
bool DTypename;
NestedNameSpecifier *DQual;
if (UsingDecl *UD = dyn_cast<UsingDecl>(D)) {
DTypename = UD->hasTypename();
DQual = UD->getQualifier();
} else if (UnresolvedUsingValueDecl *UD
= dyn_cast<UnresolvedUsingValueDecl>(D)) {
DTypename = false;
DQual = UD->getQualifier();
} else if (UnresolvedUsingTypenameDecl *UD
= dyn_cast<UnresolvedUsingTypenameDecl>(D)) {
DTypename = true;
DQual = UD->getQualifier();
} else continue;
// using decls differ if one says 'typename' and the other doesn't.
// FIXME: non-dependent using decls?
if (HasTypenameKeyword != DTypename) continue;
// using decls differ if they name different scopes (but note that
// template instantiation can cause this check to trigger when it
// didn't before instantiation).
if (Context.getCanonicalNestedNameSpecifier(Qual) !=
Context.getCanonicalNestedNameSpecifier(DQual))
continue;
Diag(NameLoc, diag::err_using_decl_redeclaration) << SS.getRange();
Diag(D->getLocation(), diag::note_using_decl) << 1;
return true;
}
return false;
}
/// Checks that the given nested-name qualifier used in a using decl
/// in the current context is appropriately related to the current
/// scope. If an error is found, diagnoses it and returns true.
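///
/// For example (illustrative):
///   struct A { int n; };
///   using A::n;                     // error: a using-declaration for a class
///                                   // member must be a member-declaration
///   struct B : A { using A::n; };   // OK: A is a base of B
///   struct C { using A::n; };       // error: A is not a base of C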
bool Sema::CheckUsingDeclQualifier(SourceLocation UsingLoc,
bool HasTypename,
const CXXScopeSpec &SS,
const DeclarationNameInfo &NameInfo,
SourceLocation NameLoc) {
DeclContext *NamedContext = computeDeclContext(SS);
if (!CurContext->isRecord()) {
// C++03 [namespace.udecl]p3:
// C++0x [namespace.udecl]p8:
// A using-declaration for a class member shall be a member-declaration.
// If we weren't able to compute a valid scope, it might validly be a
// dependent class scope or the scope of a dependent unscoped enumeration.
// If we have a 'typename' keyword, the scope must resolve to a class type.
if ((HasTypename && !NamedContext) ||
(NamedContext && NamedContext->getRedeclContext()->isRecord())) {
auto *RD = NamedContext
? cast<CXXRecordDecl>(NamedContext->getRedeclContext())
: nullptr;
if (RD && RequireCompleteDeclContext(const_cast<CXXScopeSpec&>(SS), RD))
RD = nullptr;
Diag(NameLoc, diag::err_using_decl_can_not_refer_to_class_member)
<< SS.getRange();
// If we have a complete, non-dependent source type, try to suggest a
// way to get the same effect.
if (!RD)
return true;
// Find what this using-declaration was referring to.
LookupResult R(*this, NameInfo, LookupOrdinaryName);
R.setHideTags(false);
R.suppressDiagnostics();
LookupQualifiedName(R, RD);
if (R.getAsSingle<TypeDecl>()) {
if (getLangOpts().CPlusPlus11) {
// Convert 'using X::Y;' to 'using Y = X::Y;'.
Diag(SS.getBeginLoc(), diag::note_using_decl_class_member_workaround)
<< 0 // alias declaration
<< FixItHint::CreateInsertion(SS.getBeginLoc(),
NameInfo.getName().getAsString() +
" = ");
} else {
// Convert 'using X::Y;' to 'typedef X::Y Y;'.
SourceLocation InsertLoc = getLocForEndOfToken(NameInfo.getEndLoc());
Diag(InsertLoc, diag::note_using_decl_class_member_workaround)
<< 1 // typedef declaration
<< FixItHint::CreateReplacement(UsingLoc, "typedef")
<< FixItHint::CreateInsertion(
InsertLoc, " " + NameInfo.getName().getAsString());
}
} else if (R.getAsSingle<VarDecl>()) {
// Don't provide a fixit outside C++11 mode; we don't want to suggest
// repeating the type of the static data member here.
FixItHint FixIt;
if (getLangOpts().CPlusPlus11) {
// Convert 'using X::Y;' to 'auto &Y = X::Y;'.
FixIt = FixItHint::CreateReplacement(
UsingLoc, "auto &" + NameInfo.getName().getAsString() + " = ");
}
Diag(UsingLoc, diag::note_using_decl_class_member_workaround)
<< 2 // reference declaration
<< FixIt;
} else if (R.getAsSingle<EnumConstantDecl>()) {
// Don't provide a fixit outside C++11 mode; we don't want to suggest
// repeating the type of the enumeration here, and we can't do so if
// the type is anonymous.
FixItHint FixIt;
if (getLangOpts().CPlusPlus11) {
// Convert 'using X::Y;' to 'auto &Y = X::Y;'.
FixIt = FixItHint::CreateReplacement(
UsingLoc,
"constexpr auto " + NameInfo.getName().getAsString() + " = ");
}
Diag(UsingLoc, diag::note_using_decl_class_member_workaround)
<< (getLangOpts().CPlusPlus11 ? 4 : 3) // const[expr] variable
<< FixIt;
}
return true;
}
// Otherwise, this might be valid.
return false;
}
// The current scope is a record.
// If the named context is dependent, we can't decide much.
if (!NamedContext) {
// FIXME: in C++0x, we can diagnose if we can prove that the
// nested-name-specifier does not refer to a base class, which is
// still possible in some cases.
// Otherwise we have to conservatively report that things might be
// okay.
return false;
}
if (!NamedContext->isRecord()) {
// Ideally this would point at the last name in the specifier,
// but we don't have that level of source info.
Diag(SS.getRange().getBegin(),
diag::err_using_decl_nested_name_specifier_is_not_class)
<< SS.getScopeRep() << SS.getRange();
return true;
}
if (!NamedContext->isDependentContext() &&
RequireCompleteDeclContext(const_cast<CXXScopeSpec&>(SS), NamedContext))
return true;
if (getLangOpts().CPlusPlus11) {
// C++11 [namespace.udecl]p3:
// In a using-declaration used as a member-declaration, the
// nested-name-specifier shall name a base class of the class
// being defined.
if (cast<CXXRecordDecl>(CurContext)->isProvablyNotDerivedFrom(
cast<CXXRecordDecl>(NamedContext))) {
if (CurContext == NamedContext) {
Diag(NameLoc,
diag::err_using_decl_nested_name_specifier_is_current_class)
<< SS.getRange();
return true;
}
if (!cast<CXXRecordDecl>(NamedContext)->isInvalidDecl()) {
Diag(SS.getRange().getBegin(),
diag::err_using_decl_nested_name_specifier_is_not_base_class)
<< SS.getScopeRep()
<< cast<CXXRecordDecl>(CurContext)
<< SS.getRange();
}
return true;
}
return false;
}
// C++03 [namespace.udecl]p4:
// A using-declaration used as a member-declaration shall refer
// to a member of a base class of the class being defined [etc.].
// Salient point: SS doesn't have to name a base class as long as
// lookup only finds members from base classes. Therefore we can
// diagnose here only if we can prove that that can't happen,
// i.e. if the class hierarchies provably don't intersect.
// TODO: it would be nice if "definitely valid" results were cached
// in the UsingDecl and UsingShadowDecl so that these checks didn't
// need to be repeated.
llvm::SmallPtrSet<const CXXRecordDecl *, 4> Bases;
auto Collect = [&Bases](const CXXRecordDecl *Base) {
Bases.insert(Base);
return true;
};
// Collect all bases. Return false if we find a dependent base.
if (!cast<CXXRecordDecl>(CurContext)->forallBases(Collect))
return false;
// Returns true if the given class is not one of the base classes collected
// above; forallBases also stops (returning false) at any dependent base.
auto IsNotBase = [&Bases](const CXXRecordDecl *Base) {
return !Bases.count(Base);
};
// Return false if the class has a dependent base or if it or one
// of its bases is present in the base set of the current context.
if (Bases.count(cast<CXXRecordDecl>(NamedContext)) ||
!cast<CXXRecordDecl>(NamedContext)->forallBases(IsNotBase))
return false;
Diag(SS.getRange().getBegin(),
diag::err_using_decl_nested_name_specifier_is_not_base_class)
<< SS.getScopeRep()
<< cast<CXXRecordDecl>(CurContext)
<< SS.getRange();
return true;
}
Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS,
MultiTemplateParamsArg TemplateParamLists,
SourceLocation UsingLoc, UnqualifiedId &Name,
const ParsedAttributesView &AttrList,
TypeResult Type, Decl *DeclFromDeclSpec) {
// Skip up to the relevant declaration scope.
while (S->isTemplateParamScope())
S = S->getParent();
assert((S->getFlags() & Scope::DeclScope) &&
"got alias-declaration outside of declaration scope");
if (Type.isInvalid())
return nullptr;
bool Invalid = false;
DeclarationNameInfo NameInfo = GetNameFromUnqualifiedId(Name);
TypeSourceInfo *TInfo = nullptr;
GetTypeFromParser(Type.get(), &TInfo);
if (DiagnoseClassNameShadow(CurContext, NameInfo))
return nullptr;
if (DiagnoseUnexpandedParameterPack(Name.StartLocation, TInfo,
UPPC_DeclarationType)) {
Invalid = true;
TInfo = Context.getTrivialTypeSourceInfo(Context.IntTy,
TInfo->getTypeLoc().getBeginLoc());
}
LookupResult Previous(*this, NameInfo, LookupOrdinaryName,
TemplateParamLists.size()
? forRedeclarationInCurContext()
: ForVisibleRedeclaration);
LookupName(Previous, S);
// Warn about shadowing the name of a template parameter.
if (Previous.isSingleResult() &&
Previous.getFoundDecl()->isTemplateParameter()) {
DiagnoseTemplateParameterShadow(Name.StartLocation,Previous.getFoundDecl());
Previous.clear();
}
assert(Name.Kind == UnqualifiedIdKind::IK_Identifier &&
"name in alias declaration must be an identifier");
TypeAliasDecl *NewTD = TypeAliasDecl::Create(Context, CurContext, UsingLoc,
Name.StartLocation,
Name.Identifier, TInfo);
NewTD->setAccess(AS);
if (Invalid)
NewTD->setInvalidDecl();
ProcessDeclAttributeList(S, NewTD, AttrList);
AddPragmaAttributes(S, NewTD);
CheckTypedefForVariablyModifiedType(S, NewTD);
Invalid |= NewTD->isInvalidDecl();
bool Redeclaration = false;
NamedDecl *NewND;
if (TemplateParamLists.size()) {
TypeAliasTemplateDecl *OldDecl = nullptr;
TemplateParameterList *OldTemplateParams = nullptr;
if (TemplateParamLists.size() != 1) {
Diag(UsingLoc, diag::err_alias_template_extra_headers)
<< SourceRange(TemplateParamLists[1]->getTemplateLoc(),
TemplateParamLists[TemplateParamLists.size()-1]->getRAngleLoc());
}
TemplateParameterList *TemplateParams = TemplateParamLists[0];
// Check that we can declare a template here.
if (CheckTemplateDeclScope(S, TemplateParams))
return nullptr;
// Only consider previous declarations in the same scope.
FilterLookupForScope(Previous, CurContext, S, /*ConsiderLinkage*/false,
/*ExplicitInstantiationOrSpecialization*/false);
if (!Previous.empty()) {
Redeclaration = true;
OldDecl = Previous.getAsSingle<TypeAliasTemplateDecl>();
if (!OldDecl && !Invalid) {
Diag(UsingLoc, diag::err_redefinition_different_kind)
<< Name.Identifier;
NamedDecl *OldD = Previous.getRepresentativeDecl();
if (OldD->getLocation().isValid())
Diag(OldD->getLocation(), diag::note_previous_definition);
Invalid = true;
}
if (!Invalid && OldDecl && !OldDecl->isInvalidDecl()) {
if (TemplateParameterListsAreEqual(TemplateParams,
OldDecl->getTemplateParameters(),
/*Complain=*/true,
TPL_TemplateMatch))
OldTemplateParams =
OldDecl->getMostRecentDecl()->getTemplateParameters();
else
Invalid = true;
TypeAliasDecl *OldTD = OldDecl->getTemplatedDecl();
if (!Invalid &&
!Context.hasSameType(OldTD->getUnderlyingType(),
NewTD->getUnderlyingType())) {
// FIXME: The C++0x standard does not clearly say this is ill-formed,
// but we can't reasonably accept it.
Diag(NewTD->getLocation(), diag::err_redefinition_different_typedef)
<< 2 << NewTD->getUnderlyingType() << OldTD->getUnderlyingType();
if (OldTD->getLocation().isValid())
Diag(OldTD->getLocation(), diag::note_previous_definition);
Invalid = true;
}
}
}
// Merge any previous default template arguments into our parameters,
// and check the parameter list.
if (CheckTemplateParameterList(TemplateParams, OldTemplateParams,
TPC_TypeAliasTemplate))
return nullptr;
TypeAliasTemplateDecl *NewDecl =
TypeAliasTemplateDecl::Create(Context, CurContext, UsingLoc,
Name.Identifier, TemplateParams,
NewTD);
NewTD->setDescribedAliasTemplate(NewDecl);
NewDecl->setAccess(AS);
if (Invalid)
NewDecl->setInvalidDecl();
else if (OldDecl) {
NewDecl->setPreviousDecl(OldDecl);
CheckRedeclarationModuleOwnership(NewDecl, OldDecl);
}
NewND = NewDecl;
} else {
if (auto *TD = dyn_cast_or_null<TagDecl>(DeclFromDeclSpec)) {
setTagNameForLinkagePurposes(TD, NewTD);
handleTagNumbering(TD, S);
}
ActOnTypedefNameDecl(S, CurContext, NewTD, Previous, Redeclaration);
NewND = NewTD;
}
PushOnScopeChains(NewND, S);
ActOnDocumentableDecl(NewND);
return NewND;
}
Decl *Sema::ActOnNamespaceAliasDef(Scope *S, SourceLocation NamespaceLoc,
SourceLocation AliasLoc,
IdentifierInfo *Alias, CXXScopeSpec &SS,
SourceLocation IdentLoc,
IdentifierInfo *Ident) {
// Lookup the namespace name.
LookupResult R(*this, Ident, IdentLoc, LookupNamespaceName);
LookupParsedName(R, S, &SS);
if (R.isAmbiguous())
return nullptr;
if (R.empty()) {
if (!TryNamespaceTypoCorrection(*this, R, S, SS, IdentLoc, Ident)) {
Diag(IdentLoc, diag::err_expected_namespace_name) << SS.getRange();
return nullptr;
}
}
assert(!R.isAmbiguous() && !R.empty());
NamedDecl *ND = R.getRepresentativeDecl();
// Check if we have a previous declaration with the same name.
LookupResult PrevR(*this, Alias, AliasLoc, LookupOrdinaryName,
ForVisibleRedeclaration);
LookupName(PrevR, S);
// Check we're not shadowing a template parameter.
if (PrevR.isSingleResult() && PrevR.getFoundDecl()->isTemplateParameter()) {
DiagnoseTemplateParameterShadow(AliasLoc, PrevR.getFoundDecl());
PrevR.clear();
}
// Filter out any other lookup result from an enclosing scope.
FilterLookupForScope(PrevR, CurContext, S, /*ConsiderLinkage*/false,
/*AllowInlineNamespace*/false);
// Find the previous declaration and check that we can redeclare it.
NamespaceAliasDecl *Prev = nullptr;
if (PrevR.isSingleResult()) {
NamedDecl *PrevDecl = PrevR.getRepresentativeDecl();
if (NamespaceAliasDecl *AD = dyn_cast<NamespaceAliasDecl>(PrevDecl)) {
// We already have an alias with the same name that points to the same
// namespace; check that it matches.
if (AD->getNamespace()->Equals(getNamespaceDecl(ND))) {
Prev = AD;
} else if (isVisible(PrevDecl)) {
Diag(AliasLoc, diag::err_redefinition_different_namespace_alias)
<< Alias;
Diag(AD->getLocation(), diag::note_previous_namespace_alias)
<< AD->getNamespace();
return nullptr;
}
} else if (isVisible(PrevDecl)) {
unsigned DiagID = isa<NamespaceDecl>(PrevDecl->getUnderlyingDecl())
? diag::err_redefinition
: diag::err_redefinition_different_kind;
Diag(AliasLoc, DiagID) << Alias;
Diag(PrevDecl->getLocation(), diag::note_previous_definition);
return nullptr;
}
}
// The use of a nested name specifier may trigger deprecation warnings.
DiagnoseUseOfDecl(ND, IdentLoc);
NamespaceAliasDecl *AliasDecl =
NamespaceAliasDecl::Create(Context, CurContext, NamespaceLoc, AliasLoc,
Alias, SS.getWithLocInContext(Context),
IdentLoc, ND);
if (Prev)
AliasDecl->setPreviousDecl(Prev);
PushOnScopeChains(AliasDecl, S);
return AliasDecl;
}
namespace {
struct SpecialMemberExceptionSpecInfo
: SpecialMemberVisitor<SpecialMemberExceptionSpecInfo> {
SourceLocation Loc;
Sema::ImplicitExceptionSpecification ExceptSpec;
SpecialMemberExceptionSpecInfo(Sema &S, CXXMethodDecl *MD,
Sema::CXXSpecialMember CSM,
Sema::InheritedConstructorInfo *ICI,
SourceLocation Loc)
: SpecialMemberVisitor(S, MD, CSM, ICI), Loc(Loc), ExceptSpec(S) {}
bool visitBase(CXXBaseSpecifier *Base);
bool visitField(FieldDecl *FD);
void visitClassSubobject(CXXRecordDecl *Class, Subobject Subobj,
unsigned Quals);
void visitSubobjectCall(Subobject Subobj,
Sema::SpecialMemberOverloadResult SMOR);
};
}
bool SpecialMemberExceptionSpecInfo::visitBase(CXXBaseSpecifier *Base) {
auto *RT = Base->getType()->getAs<RecordType>();
if (!RT)
return false;
auto *BaseClass = cast<CXXRecordDecl>(RT->getDecl());
Sema::SpecialMemberOverloadResult SMOR = lookupInheritedCtor(BaseClass);
if (auto *BaseCtor = SMOR.getMethod()) {
visitSubobjectCall(Base, BaseCtor);
return false;
}
visitClassSubobject(BaseClass, Base, 0);
return false;
}
bool SpecialMemberExceptionSpecInfo::visitField(FieldDecl *FD) {
if (CSM == Sema::CXXDefaultConstructor && FD->hasInClassInitializer()) {
Expr *E = FD->getInClassInitializer();
if (!E)
// FIXME: It's a little wasteful to build and throw away a
// CXXDefaultInitExpr here.
// FIXME: We should have a single context note pointing at Loc, and
// this location should be MD->getLocation() instead, since that's
// the location where we actually use the default init expression.
E = S.BuildCXXDefaultInitExpr(Loc, FD).get();
if (E)
ExceptSpec.CalledExpr(E);
} else if (auto *RT = S.Context.getBaseElementType(FD->getType())
->getAs<RecordType>()) {
visitClassSubobject(cast<CXXRecordDecl>(RT->getDecl()), FD,
FD->getType().getCVRQualifiers());
}
return false;
}
void SpecialMemberExceptionSpecInfo::visitClassSubobject(CXXRecordDecl *Class,
Subobject Subobj,
unsigned Quals) {
FieldDecl *Field = Subobj.dyn_cast<FieldDecl*>();
bool IsMutable = Field && Field->isMutable();
visitSubobjectCall(Subobj, lookupIn(Class, Quals, IsMutable));
}
void SpecialMemberExceptionSpecInfo::visitSubobjectCall(
Subobject Subobj, Sema::SpecialMemberOverloadResult SMOR) {
// Note, if lookup fails, it doesn't matter what exception specification we
// choose because the special member will be deleted.
if (CXXMethodDecl *MD = SMOR.getMethod())
ExceptSpec.CalledDecl(getSubobjectLoc(Subobj), MD);
}
namespace {
/// RAII object to register that an exception specification is currently
/// being computed.
struct ComputingExceptionSpec {
Sema &S;
ComputingExceptionSpec(Sema &S, CXXMethodDecl *MD, SourceLocation Loc)
: S(S) {
Sema::CodeSynthesisContext Ctx;
Ctx.Kind = Sema::CodeSynthesisContext::ExceptionSpecEvaluation;
Ctx.PointOfInstantiation = Loc;
Ctx.Entity = MD;
S.pushCodeSynthesisContext(Ctx);
}
~ComputingExceptionSpec() {
S.popCodeSynthesisContext();
}
};
}
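// Illustrative note (an assumption about usage, not from the original source):
// this resolves a C++2a conditional explicit-specifier such as
//   template<typename T> struct Wrap {
//     explicit(!std::is_convertible_v<T, int>) Wrap(T);
//   };
// Once the condition is no longer value-dependent it is folded to
// ResolvedTrue or ResolvedFalse; otherwise it stays Unresolved.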
bool Sema::tryResolveExplicitSpecifier(ExplicitSpecifier &ExplicitSpec) {
llvm::APSInt Result;
ExprResult Converted = CheckConvertedConstantExpression(
ExplicitSpec.getExpr(), Context.BoolTy, Result, CCEK_ExplicitBool);
ExplicitSpec.setExpr(Converted.get());
if (Converted.isUsable() && !Converted.get()->isValueDependent()) {
ExplicitSpec.setKind(Result.getBoolValue()
? ExplicitSpecKind::ResolvedTrue
: ExplicitSpecKind::ResolvedFalse);
return true;
}
ExplicitSpec.setKind(ExplicitSpecKind::Unresolved);
return false;
}
ExplicitSpecifier Sema::ActOnExplicitBoolSpecifier(Expr *ExplicitExpr) {
ExplicitSpecifier ES(ExplicitExpr, ExplicitSpecKind::Unresolved);
if (!ExplicitExpr->isTypeDependent())
tryResolveExplicitSpecifier(ES);
return ES;
}
static Sema::ImplicitExceptionSpecification
ComputeDefaultedSpecialMemberExceptionSpec(
Sema &S, SourceLocation Loc, CXXMethodDecl *MD, Sema::CXXSpecialMember CSM,
Sema::InheritedConstructorInfo *ICI) {
ComputingExceptionSpec CES(S, MD, Loc);
CXXRecordDecl *ClassDecl = MD->getParent();
// C++ [except.spec]p14:
// An implicitly declared special member function (Clause 12) shall have an
// exception-specification. [...]
SpecialMemberExceptionSpecInfo Info(S, MD, CSM, ICI, MD->getLocation());
if (ClassDecl->isInvalidDecl())
return Info.ExceptSpec;
// FIXME: If this diagnostic fires, we're probably missing a check for
// attempting to resolve an exception specification before it's known
// at a higher level.
if (S.RequireCompleteType(MD->getLocation(),
S.Context.getRecordType(ClassDecl),
diag::err_exception_spec_incomplete_type))
return Info.ExceptSpec;
// C++1z [except.spec]p7:
// [Look for exceptions thrown by] a constructor selected [...] to
// initialize a potentially constructed subobject,
// C++1z [except.spec]p8:
// The exception specification for an implicitly-declared destructor, or a
// destructor without a noexcept-specifier, is potentially-throwing if and
// only if any of the destructors for any of its potentially constructed
// subobjects is potentially throwing.
// FIXME: We respect the first rule but ignore the "potentially constructed"
// in the second rule to resolve a core issue (no number yet) that would have
// us reject:
// struct A { virtual void f() = 0; virtual ~A() noexcept(false) = 0; };
// struct B : A {};
// struct C : B { void f(); };
// ... due to giving B::~B() a non-throwing exception specification.
Info.visit(Info.IsConstructor ? Info.VisitPotentiallyConstructedBases
: Info.VisitAllBases);
return Info.ExceptSpec;
}
namespace {
/// RAII object to register a special member as being currently declared.
struct DeclaringSpecialMember {
Sema &S;
Sema::SpecialMemberDecl D;
Sema::ContextRAII SavedContext;
bool WasAlreadyBeingDeclared;
DeclaringSpecialMember(Sema &S, CXXRecordDecl *RD, Sema::CXXSpecialMember CSM)
: S(S), D(RD, CSM), SavedContext(S, RD) {
WasAlreadyBeingDeclared = !S.SpecialMembersBeingDeclared.insert(D).second;
if (WasAlreadyBeingDeclared)
// This almost never happens, but if it does, ensure that our cache
// doesn't contain a stale result.
S.SpecialMemberCache.clear();
else {
// Register a note to be produced if we encounter an error while
// declaring the special member.
Sema::CodeSynthesisContext Ctx;
Ctx.Kind = Sema::CodeSynthesisContext::DeclaringSpecialMember;
// FIXME: We don't have a location to use here. Using the class's
// location maintains the fiction that we declare all special members
// with the class, but (1) it's not clear that lying about that helps our
// users understand what's going on, and (2) there may be outer contexts
// on the stack (some of which are relevant) and printing them exposes
// our lies.
Ctx.PointOfInstantiation = RD->getLocation();
Ctx.Entity = RD;
Ctx.SpecialMember = CSM;
S.pushCodeSynthesisContext(Ctx);
}
}
~DeclaringSpecialMember() {
if (!WasAlreadyBeingDeclared) {
S.SpecialMembersBeingDeclared.erase(D);
S.popCodeSynthesisContext();
}
}
/// Are we already trying to declare this special member?
bool isAlreadyBeingDeclared() const {
return WasAlreadyBeingDeclared;
}
};
}
void Sema::CheckImplicitSpecialMemberDeclaration(Scope *S, FunctionDecl *FD) {
// Look up any existing declarations, but don't trigger declaration of all
// implicit special members with this name.
DeclarationName Name = FD->getDeclName();
LookupResult R(*this, Name, SourceLocation(), LookupOrdinaryName,
ForExternalRedeclaration);
for (auto *D : FD->getParent()->lookup(Name))
if (auto *Acceptable = R.getAcceptableDecl(D))
R.addDecl(Acceptable);
R.resolveKind();
R.suppressDiagnostics();
CheckFunctionDeclaration(S, FD, R, /*IsMemberSpecialization*/false);
}
void Sema::setupImplicitSpecialMemberType(CXXMethodDecl *SpecialMem,
QualType ResultTy,
ArrayRef<QualType> Args) {
// Build an exception specification pointing back at this constructor.
FunctionProtoType::ExtProtoInfo EPI = getImplicitMethodEPI(*this, SpecialMem);
if (getLangOpts().OpenCLCPlusPlus) {
// OpenCL: Implicitly defaulted special members are of the generic address
// space.
EPI.TypeQuals.addAddressSpace(LangAS::opencl_generic);
}
auto QT = Context.getFunctionType(ResultTy, Args, EPI);
SpecialMem->setType(QT);
}
CXXConstructorDecl *Sema::DeclareImplicitDefaultConstructor(
CXXRecordDecl *ClassDecl) {
// C++ [class.ctor]p5:
// A default constructor for a class X is a constructor of class X
// that can be called without an argument. If there is no
// user-declared constructor for class X, a default constructor is
// implicitly declared. An implicitly-declared default constructor
// is an inline public member of its class.
assert(ClassDecl->needsImplicitDefaultConstructor() &&
"Should not build implicit default constructor!");
DeclaringSpecialMember DSM(*this, ClassDecl, CXXDefaultConstructor);
if (DSM.isAlreadyBeingDeclared())
return nullptr;
bool Constexpr = defaultedSpecialMemberIsConstexpr(*this, ClassDecl,
CXXDefaultConstructor,
false);
// Create the actual constructor declaration.
CanQualType ClassType
= Context.getCanonicalType(Context.getTypeDeclType(ClassDecl));
SourceLocation ClassLoc = ClassDecl->getLocation();
DeclarationName Name
= Context.DeclarationNames.getCXXConstructorName(ClassType);
DeclarationNameInfo NameInfo(Name, ClassLoc);
CXXConstructorDecl *DefaultCon = CXXConstructorDecl::Create(
Context, ClassDecl, ClassLoc, NameInfo, /*Type*/ QualType(),
/*TInfo=*/nullptr, ExplicitSpecifier(),
/*isInline=*/true, /*isImplicitlyDeclared=*/true,
Constexpr ? CSK_constexpr : CSK_unspecified);
DefaultCon->setAccess(AS_public);
DefaultCon->setDefaulted();
if (getLangOpts().CUDA) {
inferCUDATargetForImplicitSpecialMember(ClassDecl, CXXDefaultConstructor,
DefaultCon,
/* ConstRHS */ false,
/* Diagnose */ false);
}
setupImplicitSpecialMemberType(DefaultCon, Context.VoidTy, None);
// We don't need to use SpecialMemberIsTrivial here; triviality for default
// constructors is easy to compute.
DefaultCon->setTrivial(ClassDecl->hasTrivialDefaultConstructor());
// Note that we have declared this constructor.
++getASTContext().NumImplicitDefaultConstructorsDeclared;
Scope *S = getScopeForContext(ClassDecl);
CheckImplicitSpecialMemberDeclaration(S, DefaultCon);
if (ShouldDeleteSpecialMember(DefaultCon, CXXDefaultConstructor))
SetDeclDeleted(DefaultCon, ClassLoc);
if (S)
PushOnScopeChains(DefaultCon, S, false);
ClassDecl->addDecl(DefaultCon);
return DefaultCon;
}
void Sema::DefineImplicitDefaultConstructor(SourceLocation CurrentLocation,
CXXConstructorDecl *Constructor) {
assert((Constructor->isDefaulted() && Constructor->isDefaultConstructor() &&
!Constructor->doesThisDeclarationHaveABody() &&
!Constructor->isDeleted()) &&
"DefineImplicitDefaultConstructor - call it for implicit default ctor");
if (Constructor->willHaveBody() || Constructor->isInvalidDecl())
return;
CXXRecordDecl *ClassDecl = Constructor->getParent();
assert(ClassDecl && "DefineImplicitDefaultConstructor - invalid constructor");
SynthesizedFunctionScope Scope(*this, Constructor);
// The exception specification is needed because we are defining the
// function.
ResolveExceptionSpec(CurrentLocation,
Constructor->getType()->castAs<FunctionProtoType>());
MarkVTableUsed(CurrentLocation, ClassDecl);
// Add a context note for diagnostics produced after this point.
Scope.addContextNote(CurrentLocation);
if (SetCtorInitializers(Constructor, /*AnyErrors=*/false)) {
Constructor->setInvalidDecl();
return;
}
SourceLocation Loc = Constructor->getEndLoc().isValid()
? Constructor->getEndLoc()
: Constructor->getLocation();
Constructor->setBody(new (Context) CompoundStmt(Loc));
Constructor->markUsed(Context);
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(Constructor);
}
DiagnoseUninitializedFields(*this, Constructor);
}
void Sema::ActOnFinishDelayedMemberInitializers(Decl *D) {
// Perform any delayed checks on exception specifications.
CheckDelayedMemberExceptionSpecs();
}
/// Find or create the fake constructor we synthesize to model constructing an
/// object of a derived class via a constructor of a base class.
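///
/// For example (illustrative):
///   struct A { A(int); };
///   struct B : A { using A::A; };
///   B b(0);   // uses a synthesized B::B(int) that forwards to A::A(int)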
CXXConstructorDecl *
Sema::findInheritingConstructor(SourceLocation Loc,
CXXConstructorDecl *BaseCtor,
ConstructorUsingShadowDecl *Shadow) {
CXXRecordDecl *Derived = Shadow->getParent();
SourceLocation UsingLoc = Shadow->getLocation();
// FIXME: Add a new kind of DeclarationName for an inherited constructor.
// For now we use the name of the base class constructor as a member of the
// derived class to indicate a (fake) inherited constructor name.
DeclarationName Name = BaseCtor->getDeclName();
// Check to see if we already have a fake constructor for this inherited
// constructor call.
for (NamedDecl *Ctor : Derived->lookup(Name))
if (declaresSameEntity(cast<CXXConstructorDecl>(Ctor)
->getInheritedConstructor()
.getConstructor(),
BaseCtor))
return cast<CXXConstructorDecl>(Ctor);
DeclarationNameInfo NameInfo(Name, UsingLoc);
TypeSourceInfo *TInfo =
Context.getTrivialTypeSourceInfo(BaseCtor->getType(), UsingLoc);
FunctionProtoTypeLoc ProtoLoc =
TInfo->getTypeLoc().IgnoreParens().castAs<FunctionProtoTypeLoc>();
// Check the inherited constructor is valid and find the list of base classes
// from which it was inherited.
InheritedConstructorInfo ICI(*this, Loc, Shadow);
bool Constexpr =
BaseCtor->isConstexpr() &&
defaultedSpecialMemberIsConstexpr(*this, Derived, CXXDefaultConstructor,
false, BaseCtor, &ICI);
CXXConstructorDecl *DerivedCtor = CXXConstructorDecl::Create(
Context, Derived, UsingLoc, NameInfo, TInfo->getType(), TInfo,
BaseCtor->getExplicitSpecifier(), /*isInline=*/true,
/*isImplicitlyDeclared=*/true,
Constexpr ? BaseCtor->getConstexprKind() : CSK_unspecified,
InheritedConstructor(Shadow, BaseCtor));
if (Shadow->isInvalidDecl())
DerivedCtor->setInvalidDecl();
// Build an unevaluated exception specification for this fake constructor.
const FunctionProtoType *FPT = TInfo->getType()->castAs<FunctionProtoType>();
FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
EPI.ExceptionSpec.Type = EST_Unevaluated;
EPI.ExceptionSpec.SourceDecl = DerivedCtor;
DerivedCtor->setType(Context.getFunctionType(FPT->getReturnType(),
FPT->getParamTypes(), EPI));
// Build the parameter declarations.
SmallVector<ParmVarDecl *, 16> ParamDecls;
for (unsigned I = 0, N = FPT->getNumParams(); I != N; ++I) {
TypeSourceInfo *TInfo =
Context.getTrivialTypeSourceInfo(FPT->getParamType(I), UsingLoc);
ParmVarDecl *PD = ParmVarDecl::Create(
Context, DerivedCtor, UsingLoc, UsingLoc, /*IdentifierInfo=*/nullptr,
FPT->getParamType(I), TInfo, SC_None, /*DefArg=*/nullptr);
PD->setScopeInfo(0, I);
PD->setImplicit();
// Ensure attributes are propagated onto parameters (this matters for
// format, pass_object_size, ...).
mergeDeclAttributes(PD, BaseCtor->getParamDecl(I));
ParamDecls.push_back(PD);
ProtoLoc.setParam(I, PD);
}
// Set up the new constructor.
assert(!BaseCtor->isDeleted() && "should not use deleted constructor");
DerivedCtor->setAccess(BaseCtor->getAccess());
DerivedCtor->setParams(ParamDecls);
Derived->addDecl(DerivedCtor);
if (ShouldDeleteSpecialMember(DerivedCtor, CXXDefaultConstructor, &ICI))
SetDeclDeleted(DerivedCtor, UsingLoc);
return DerivedCtor;
}
void Sema::NoteDeletedInheritingConstructor(CXXConstructorDecl *Ctor) {
InheritedConstructorInfo ICI(*this, Ctor->getLocation(),
Ctor->getInheritedConstructor().getShadowDecl());
ShouldDeleteSpecialMember(Ctor, CXXDefaultConstructor, &ICI,
/*Diagnose*/true);
}
void Sema::DefineInheritingConstructor(SourceLocation CurrentLocation,
CXXConstructorDecl *Constructor) {
CXXRecordDecl *ClassDecl = Constructor->getParent();
assert(Constructor->getInheritedConstructor() &&
!Constructor->doesThisDeclarationHaveABody() &&
!Constructor->isDeleted());
if (Constructor->willHaveBody() || Constructor->isInvalidDecl())
return;
// Initializations are performed "as if by a defaulted default constructor",
// so enter the appropriate scope.
SynthesizedFunctionScope Scope(*this, Constructor);
// The exception specification is needed because we are defining the
// function.
ResolveExceptionSpec(CurrentLocation,
Constructor->getType()->castAs<FunctionProtoType>());
MarkVTableUsed(CurrentLocation, ClassDecl);
// Add a context note for diagnostics produced after this point.
Scope.addContextNote(CurrentLocation);
ConstructorUsingShadowDecl *Shadow =
Constructor->getInheritedConstructor().getShadowDecl();
CXXConstructorDecl *InheritedCtor =
Constructor->getInheritedConstructor().getConstructor();
// [class.inhctor.init]p1:
// initialization proceeds as if a defaulted default constructor is used to
// initialize the D object and each base class subobject from which the
// constructor was inherited
InheritedConstructorInfo ICI(*this, CurrentLocation, Shadow);
CXXRecordDecl *RD = Shadow->getParent();
SourceLocation InitLoc = Shadow->getLocation();
// Build explicit initializers for all base classes from which the
// constructor was inherited.
SmallVector<CXXCtorInitializer*, 8> Inits;
for (bool VBase : {false, true}) {
for (CXXBaseSpecifier &B : VBase ? RD->vbases() : RD->bases()) {
if (B.isVirtual() != VBase)
continue;
auto *BaseRD = B.getType()->getAsCXXRecordDecl();
if (!BaseRD)
continue;
auto BaseCtor = ICI.findConstructorForBase(BaseRD, InheritedCtor);
if (!BaseCtor.first)
continue;
MarkFunctionReferenced(CurrentLocation, BaseCtor.first);
ExprResult Init = new (Context) CXXInheritedCtorInitExpr(
InitLoc, B.getType(), BaseCtor.first, VBase, BaseCtor.second);
auto *TInfo = Context.getTrivialTypeSourceInfo(B.getType(), InitLoc);
Inits.push_back(new (Context) CXXCtorInitializer(
Context, TInfo, VBase, InitLoc, Init.get(), InitLoc,
SourceLocation()));
}
}
// We now proceed as if for a defaulted default constructor, with the relevant
// initializers replaced.
if (SetCtorInitializers(Constructor, /*AnyErrors*/false, Inits)) {
Constructor->setInvalidDecl();
return;
}
Constructor->setBody(new (Context) CompoundStmt(InitLoc));
Constructor->markUsed(Context);
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(Constructor);
}
DiagnoseUninitializedFields(*this, Constructor);
}
CXXDestructorDecl *Sema::DeclareImplicitDestructor(CXXRecordDecl *ClassDecl) {
// C++ [class.dtor]p2:
// If a class has no user-declared destructor, a destructor is
// declared implicitly. An implicitly-declared destructor is an
// inline public member of its class.
assert(ClassDecl->needsImplicitDestructor());
DeclaringSpecialMember DSM(*this, ClassDecl, CXXDestructor);
if (DSM.isAlreadyBeingDeclared())
return nullptr;
// Create the actual destructor declaration.
CanQualType ClassType
= Context.getCanonicalType(Context.getTypeDeclType(ClassDecl));
SourceLocation ClassLoc = ClassDecl->getLocation();
DeclarationName Name
= Context.DeclarationNames.getCXXDestructorName(ClassType);
DeclarationNameInfo NameInfo(Name, ClassLoc);
CXXDestructorDecl *Destructor
= CXXDestructorDecl::Create(Context, ClassDecl, ClassLoc, NameInfo,
QualType(), nullptr, /*isInline=*/true,
/*isImplicitlyDeclared=*/true);
Destructor->setAccess(AS_public);
Destructor->setDefaulted();
if (getLangOpts().CUDA) {
inferCUDATargetForImplicitSpecialMember(ClassDecl, CXXDestructor,
Destructor,
/* ConstRHS */ false,
/* Diagnose */ false);
}
setupImplicitSpecialMemberType(Destructor, Context.VoidTy, None);
// We don't need to use SpecialMemberIsTrivial here; triviality for
// destructors is easy to compute.
Destructor->setTrivial(ClassDecl->hasTrivialDestructor());
Destructor->setTrivialForCall(ClassDecl->hasAttr<TrivialABIAttr>() ||
ClassDecl->hasTrivialDestructorForCall());
// Note that we have declared this destructor.
++getASTContext().NumImplicitDestructorsDeclared;
Scope *S = getScopeForContext(ClassDecl);
CheckImplicitSpecialMemberDeclaration(S, Destructor);
// We can't check whether an implicit destructor is deleted before we complete
// the definition of the class, because its validity depends on the alignment
// of the class. We'll check this from ActOnFields once the class is complete.
if (ClassDecl->isCompleteDefinition() &&
ShouldDeleteSpecialMember(Destructor, CXXDestructor))
SetDeclDeleted(Destructor, ClassLoc);
// Introduce this destructor into its scope.
if (S)
PushOnScopeChains(Destructor, S, false);
ClassDecl->addDecl(Destructor);
return Destructor;
}
void Sema::DefineImplicitDestructor(SourceLocation CurrentLocation,
CXXDestructorDecl *Destructor) {
assert((Destructor->isDefaulted() &&
!Destructor->doesThisDeclarationHaveABody() &&
!Destructor->isDeleted()) &&
"DefineImplicitDestructor - call it for implicit default dtor");
if (Destructor->willHaveBody() || Destructor->isInvalidDecl())
return;
CXXRecordDecl *ClassDecl = Destructor->getParent();
assert(ClassDecl && "DefineImplicitDestructor - invalid destructor");
SynthesizedFunctionScope Scope(*this, Destructor);
// The exception specification is needed because we are defining the
// function.
ResolveExceptionSpec(CurrentLocation,
Destructor->getType()->castAs<FunctionProtoType>());
MarkVTableUsed(CurrentLocation, ClassDecl);
// Add a context note for diagnostics produced after this point.
Scope.addContextNote(CurrentLocation);
MarkBaseAndMemberDestructorsReferenced(Destructor->getLocation(),
Destructor->getParent());
if (CheckDestructor(Destructor)) {
Destructor->setInvalidDecl();
return;
}
SourceLocation Loc = Destructor->getEndLoc().isValid()
? Destructor->getEndLoc()
: Destructor->getLocation();
Destructor->setBody(new (Context) CompoundStmt(Loc));
Destructor->markUsed(Context);
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(Destructor);
}
}
/// Perform any semantic analysis which needs to be delayed until all
/// pending class member declarations have been parsed.
void Sema::ActOnFinishCXXMemberDecls() {
// If the context is an invalid C++ class, just suppress these checks.
if (CXXRecordDecl *Record = dyn_cast<CXXRecordDecl>(CurContext)) {
if (Record->isInvalidDecl()) {
DelayedOverridingExceptionSpecChecks.clear();
DelayedEquivalentExceptionSpecChecks.clear();
return;
}
checkForMultipleExportedDefaultConstructors(*this, Record);
}
}
void Sema::ActOnFinishCXXNonNestedClass(Decl *D) {
referenceDLLExportedClassMethods();
if (!DelayedDllExportMemberFunctions.empty()) {
SmallVector<CXXMethodDecl*, 4> WorkList;
std::swap(DelayedDllExportMemberFunctions, WorkList);
for (CXXMethodDecl *M : WorkList) {
DefineImplicitSpecialMember(*this, M, M->getLocation());
// Pass the method to the consumer to get emitted. This is not necessary
// for explicit instantiation definitions, as they will get emitted
// anyway.
if (M->getParent()->getTemplateSpecializationKind() !=
TSK_ExplicitInstantiationDefinition)
ActOnFinishInlineFunctionDef(M);
}
}
}
void Sema::referenceDLLExportedClassMethods() {
if (!DelayedDllExportClasses.empty()) {
// Calling ReferenceDllExportedMembers might cause the current function to
// be called again, so use a local copy of DelayedDllExportClasses.
SmallVector<CXXRecordDecl *, 4> WorkList;
std::swap(DelayedDllExportClasses, WorkList);
for (CXXRecordDecl *Class : WorkList)
ReferenceDllExportedMembers(*this, Class);
}
}
void Sema::AdjustDestructorExceptionSpec(CXXDestructorDecl *Destructor) {
assert(getLangOpts().CPlusPlus11 &&
"adjusting dtor exception specs was introduced in c++11");
if (Destructor->isDependentContext())
return;
// C++11 [class.dtor]p3:
// A declaration of a destructor that does not have an exception-
// specification is implicitly considered to have the same exception-
// specification as an implicit declaration.
const FunctionProtoType *DtorType = Destructor->getType()->
getAs<FunctionProtoType>();
if (DtorType->hasExceptionSpec())
return;
// Replace the destructor's type, building off the existing one. Fortunately,
// the only thing of interest in the destructor type is its extended info.
// The return and arguments are fixed.
FunctionProtoType::ExtProtoInfo EPI = DtorType->getExtProtoInfo();
EPI.ExceptionSpec.Type = EST_Unevaluated;
EPI.ExceptionSpec.SourceDecl = Destructor;
Destructor->setType(Context.getFunctionType(Context.VoidTy, None, EPI));
// FIXME: If the destructor has a body that could throw, and the newly created
// spec doesn't allow exceptions, we should emit a warning, because this
// change in behavior can break conforming C++03 programs at runtime.
// However, we don't have a body or an exception specification yet, so it
// needs to be done somewhere else.
}
namespace {
/// An abstract base class for all helper classes used in building the
/// copy/move operators. These classes serve as factory functions and help us
/// avoid using the same Expr* in the AST twice.
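///
/// Illustrative note (an assumption about how the builders compose, based on
/// the classes below): wrapping a ThisBuilder in a DerefBuilder and then in a
/// MemberBuilder yields a fresh '(*this).field' expression on every build()
/// call, so the same Expr node is never reused in two places in the AST.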
class ExprBuilder {
ExprBuilder(const ExprBuilder&) = delete;
ExprBuilder &operator=(const ExprBuilder&) = delete;
protected:
static Expr *assertNotNull(Expr *E) {
assert(E && "Expression construction must not fail.");
return E;
}
public:
ExprBuilder() {}
virtual ~ExprBuilder() {}
virtual Expr *build(Sema &S, SourceLocation Loc) const = 0;
};
class RefBuilder: public ExprBuilder {
VarDecl *Var;
QualType VarType;
public:
Expr *build(Sema &S, SourceLocation Loc) const override {
return assertNotNull(S.BuildDeclRefExpr(Var, VarType, VK_LValue, Loc));
}
RefBuilder(VarDecl *Var, QualType VarType)
: Var(Var), VarType(VarType) {}
};
class ThisBuilder: public ExprBuilder {
public:
Expr *build(Sema &S, SourceLocation Loc) const override {
return assertNotNull(S.ActOnCXXThis(Loc).getAs<Expr>());
}
};
class CastBuilder: public ExprBuilder {
const ExprBuilder &Builder;
QualType Type;
ExprValueKind Kind;
const CXXCastPath &Path;
public:
Expr *build(Sema &S, SourceLocation Loc) const override {
return assertNotNull(S.ImpCastExprToType(Builder.build(S, Loc), Type,
CK_UncheckedDerivedToBase, Kind,
&Path).get());
}
CastBuilder(const ExprBuilder &Builder, QualType Type, ExprValueKind Kind,
const CXXCastPath &Path)
: Builder(Builder), Type(Type), Kind(Kind), Path(Path) {}
};
class DerefBuilder: public ExprBuilder {
const ExprBuilder &Builder;
public:
Expr *build(Sema &S, SourceLocation Loc) const override {
return assertNotNull(
S.CreateBuiltinUnaryOp(Loc, UO_Deref, Builder.build(S, Loc)).get());
}
DerefBuilder(const ExprBuilder &Builder) : Builder(Builder) {}
};
class MemberBuilder: public ExprBuilder {
const ExprBuilder &Builder;
QualType Type;
CXXScopeSpec SS;
bool IsArrow;
LookupResult &MemberLookup;
public:
Expr *build(Sema &S, SourceLocation Loc) const override {
return assertNotNull(S.BuildMemberReferenceExpr(
Builder.build(S, Loc), Type, Loc, IsArrow, SS, SourceLocation(),
nullptr, MemberLookup, nullptr, nullptr).get());
}
MemberBuilder(const ExprBuilder &Builder, QualType Type, bool IsArrow,
LookupResult &MemberLookup)
: Builder(Builder), Type(Type), IsArrow(IsArrow),
MemberLookup(MemberLookup) {}
};
class MoveCastBuilder: public ExprBuilder {
const ExprBuilder &Builder;
public:
Expr *build(Sema &S, SourceLocation Loc) const override {
return assertNotNull(CastForMoving(S, Builder.build(S, Loc)));
}
MoveCastBuilder(const ExprBuilder &Builder) : Builder(Builder) {}
};
class LvalueConvBuilder: public ExprBuilder {
const ExprBuilder &Builder;
public:
Expr *build(Sema &S, SourceLocation Loc) const override {
return assertNotNull(
S.DefaultLvalueConversion(Builder.build(S, Loc)).get());
}
LvalueConvBuilder(const ExprBuilder &Builder) : Builder(Builder) {}
};
class SubscriptBuilder: public ExprBuilder {
const ExprBuilder &Base;
const ExprBuilder &Index;
public:
Expr *build(Sema &S, SourceLocation Loc) const override {
return assertNotNull(S.CreateBuiltinArraySubscriptExpr(
Base.build(S, Loc), Loc, Index.build(S, Loc), Loc).get());
}
SubscriptBuilder(const ExprBuilder &Base, const ExprBuilder &Index)
: Base(Base), Index(Index) {}
};
} // end anonymous namespace
/// When generating a defaulted copy or move assignment operator, if a field
/// should be copied with __builtin_memcpy rather than via explicit assignments,
/// do so. This optimization only applies for arrays of scalars, and for arrays
/// of class type where the selected copy/move-assignment operator is trivial.
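///
/// For example (illustrative):
///   struct S { int a[64]; };
/// The defaulted S::operator= can copy 'a' with a single __builtin_memcpy of
/// sizeof(int[64]) bytes instead of an element-by-element loop.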
static StmtResult
buildMemcpyForAssignmentOp(Sema &S, SourceLocation Loc, QualType T,
const ExprBuilder &ToB, const ExprBuilder &FromB) {
// Compute the size of the memory buffer to be copied.
QualType SizeType = S.Context.getSizeType();
llvm::APInt Size(S.Context.getTypeSize(SizeType),
S.Context.getTypeSizeInChars(T).getQuantity());
// Take the address of the field references for "from" and "to". We
// directly construct UnaryOperators here because semantic analysis
// does not permit us to take the address of an xvalue.
Expr *From = FromB.build(S, Loc);
From = new (S.Context) UnaryOperator(From, UO_AddrOf,
S.Context.getPointerType(From->getType()),
VK_RValue, OK_Ordinary, Loc, false);
Expr *To = ToB.build(S, Loc);
To = new (S.Context) UnaryOperator(To, UO_AddrOf,
S.Context.getPointerType(To->getType()),
VK_RValue, OK_Ordinary, Loc, false);
const Type *E = T->getBaseElementTypeUnsafe();
bool NeedsCollectableMemCpy =
E->isRecordType() && E->getAs<RecordType>()->getDecl()->hasObjectMember();
// Create a reference to the __builtin_objc_memmove_collectable function
StringRef MemCpyName = NeedsCollectableMemCpy ?
"__builtin_objc_memmove_collectable" :
"__builtin_memcpy";
LookupResult R(S, &S.Context.Idents.get(MemCpyName), Loc,
Sema::LookupOrdinaryName);
S.LookupName(R, S.TUScope, true);
FunctionDecl *MemCpy = R.getAsSingle<FunctionDecl>();
if (!MemCpy)
// Something went horribly wrong earlier, and we will have complained
// about it.
return StmtError();
ExprResult MemCpyRef = S.BuildDeclRefExpr(MemCpy, S.Context.BuiltinFnTy,
VK_RValue, Loc, nullptr);
assert(MemCpyRef.isUsable() && "Builtin reference cannot fail");
Expr *CallArgs[] = {
To, From, IntegerLiteral::Create(S.Context, Size, SizeType, Loc)
};
ExprResult Call = S.BuildCallExpr(/*Scope=*/nullptr, MemCpyRef.get(),
Loc, CallArgs, Loc);
assert(!Call.isInvalid() && "Call to __builtin_memcpy cannot fail!");
return Call.getAs<Stmt>();
}
/// Builds a statement that copies/moves the given entity from \p From to
/// \c To.
///
/// This routine is used to copy/move the members of a class with an
/// implicitly-declared copy/move assignment operator. When the entities being
/// copied are arrays, this routine builds for loops to copy them.
///
/// \param S The Sema object used for type-checking.
///
/// \param Loc The location where the implicit copy/move is being generated.
///
/// \param T The type of the expressions being copied/moved. Both expressions
/// must have this type.
///
/// \param To The expression we are copying/moving to.
///
/// \param From The expression we are copying/moving from.
///
/// \param CopyingBaseSubobject Whether we're copying/moving a base subobject.
/// Otherwise, it's a non-static member subobject.
///
/// \param Copying Whether we're copying or moving.
///
/// \param Depth Internal parameter recording the depth of the recursion.
///
/// \returns A statement or a loop that copies the expressions, or StmtResult(0)
/// if a memcpy should be used instead.
static StmtResult
buildSingleCopyAssignRecursively(Sema &S, SourceLocation Loc, QualType T,
const ExprBuilder &To, const ExprBuilder &From,
bool CopyingBaseSubobject, bool Copying,
unsigned Depth = 0) {
// C++11 [class.copy]p28:
// Each subobject is assigned in the manner appropriate to its type:
//
// - if the subobject is of class type, as if by a call to operator= with
// the subobject as the object expression and the corresponding
// subobject of x as a single function argument (as if by explicit
// qualification; that is, ignoring any possible virtual overriding
// functions in more derived classes);
//
// C++03 [class.copy]p13:
// - if the subobject is of class type, the copy assignment operator for
// the class is used (as if by explicit qualification; that is,
// ignoring any possible virtual overriding functions in more derived
// classes);
if (const RecordType *RecordTy = T->getAs<RecordType>()) {
CXXRecordDecl *ClassDecl = cast<CXXRecordDecl>(RecordTy->getDecl());
// Look for operator=.
DeclarationName Name
= S.Context.DeclarationNames.getCXXOperatorName(OO_Equal);
LookupResult OpLookup(S, Name, Loc, Sema::LookupOrdinaryName);
S.LookupQualifiedName(OpLookup, ClassDecl, false);
// Prior to C++11, filter out any result that isn't a copy/move-assignment
// operator.
if (!S.getLangOpts().CPlusPlus11) {
LookupResult::Filter F = OpLookup.makeFilter();
while (F.hasNext()) {
NamedDecl *D = F.next();
if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D))
if (Method->isCopyAssignmentOperator() ||
(!Copying && Method->isMoveAssignmentOperator()))
continue;
F.erase();
}
F.done();
}
// Suppress the protected check (C++ [class.protected]) for each of the
// assignment operators we found. This strange dance is required when
// we're assigning via a base class's copy-assignment operator. To
// ensure that we're getting the right base class subobject (without
// ambiguities), we need to cast "this" to that subobject type; to
// ensure that we don't go through the virtual call mechanism, we need
// to qualify the operator= name with the base class (see below). However,
// this means that if the base class has a protected copy assignment
// operator, the protected member access check will fail. So, we
// rewrite "protected" access to "public" access in this case, since we
// know by construction that we're calling from a derived class.
if (CopyingBaseSubobject) {
for (LookupResult::iterator L = OpLookup.begin(), LEnd = OpLookup.end();
L != LEnd; ++L) {
if (L.getAccess() == AS_protected)
L.setAccess(AS_public);
}
}
// Create the nested-name-specifier that will be used to qualify the
// reference to operator=; this is required to suppress the virtual
// call mechanism.
CXXScopeSpec SS;
const Type *CanonicalT = S.Context.getCanonicalType(T.getTypePtr());
SS.MakeTrivial(S.Context,
NestedNameSpecifier::Create(S.Context, nullptr, false,
CanonicalT),
Loc);
// Create the reference to operator=.
ExprResult OpEqualRef
= S.BuildMemberReferenceExpr(To.build(S, Loc), T, Loc, /*IsArrow=*/false,
SS, /*TemplateKWLoc=*/SourceLocation(),
/*FirstQualifierInScope=*/nullptr,
OpLookup,
/*TemplateArgs=*/nullptr, /*S*/nullptr,
/*SuppressQualifierCheck=*/true);
if (OpEqualRef.isInvalid())
return StmtError();
// Build the call to the assignment operator.
Expr *FromInst = From.build(S, Loc);
ExprResult Call = S.BuildCallToMemberFunction(/*Scope=*/nullptr,
OpEqualRef.getAs<Expr>(),
Loc, FromInst, Loc);
if (Call.isInvalid())
return StmtError();
// If we built a call to a trivial 'operator=' while copying an array,
// bail out. We'll replace the whole shebang with a memcpy.
CXXMemberCallExpr *CE = dyn_cast<CXXMemberCallExpr>(Call.get());
if (CE && CE->getMethodDecl()->isTrivial() && Depth)
return StmtResult((Stmt*)nullptr);
// Convert to an expression-statement, and clean up any produced
// temporaries.
return S.ActOnExprStmt(Call);
}
// - if the subobject is of scalar type, the built-in assignment
// operator is used.
const ConstantArrayType *ArrayTy = S.Context.getAsConstantArrayType(T);
if (!ArrayTy) {
ExprResult Assignment = S.CreateBuiltinBinOp(
Loc, BO_Assign, To.build(S, Loc), From.build(S, Loc));
if (Assignment.isInvalid())
return StmtError();
return S.ActOnExprStmt(Assignment);
}
// - if the subobject is an array, each element is assigned, in the
// manner appropriate to the element type;
// Construct a loop over the array bounds, e.g.,
//
// for (__SIZE_TYPE__ i0 = 0; i0 != array-size; ++i0)
//
// that will copy each of the array elements.
QualType SizeType = S.Context.getSizeType();
// Create the iteration variable.
IdentifierInfo *IterationVarName = nullptr;
{
SmallString<8> Str;
llvm::raw_svector_ostream OS(Str);
OS << "__i" << Depth;
IterationVarName = &S.Context.Idents.get(OS.str());
}
VarDecl *IterationVar = VarDecl::Create(S.Context, S.CurContext, Loc, Loc,
IterationVarName, SizeType,
S.Context.getTrivialTypeSourceInfo(SizeType, Loc),
SC_None);
// Initialize the iteration variable to zero.
llvm::APInt Zero(S.Context.getTypeSize(SizeType), 0);
IterationVar->setInit(IntegerLiteral::Create(S.Context, Zero, SizeType, Loc));
// Creates a reference to the iteration variable.
RefBuilder IterationVarRef(IterationVar, SizeType);
LvalueConvBuilder IterationVarRefRVal(IterationVarRef);
// Create the DeclStmt that holds the iteration variable.
Stmt *InitStmt = new (S.Context) DeclStmt(DeclGroupRef(IterationVar),Loc,Loc);
// Subscript the "from" and "to" expressions with the iteration variable.
SubscriptBuilder FromIndexCopy(From, IterationVarRefRVal);
MoveCastBuilder FromIndexMove(FromIndexCopy);
const ExprBuilder *FromIndex;
if (Copying)
FromIndex = &FromIndexCopy;
else
FromIndex = &FromIndexMove;
SubscriptBuilder ToIndex(To, IterationVarRefRVal);
// Build the copy/move for an individual element of the array.
StmtResult Copy =
buildSingleCopyAssignRecursively(S, Loc, ArrayTy->getElementType(),
ToIndex, *FromIndex, CopyingBaseSubobject,
Copying, Depth + 1);
// Bail out if copying fails or if we determined that we should use memcpy.
if (Copy.isInvalid() || !Copy.get())
return Copy;
// Create the comparison against the array bound.
llvm::APInt Upper
= ArrayTy->getSize().zextOrTrunc(S.Context.getTypeSize(SizeType));
Expr *Comparison
= new (S.Context) BinaryOperator(IterationVarRefRVal.build(S, Loc),
IntegerLiteral::Create(S.Context, Upper, SizeType, Loc),
BO_NE, S.Context.BoolTy,
VK_RValue, OK_Ordinary, Loc, FPOptions());
// Create the pre-increment of the iteration variable. We can determine
// whether the increment will overflow based on the value of the array
// bound.
Expr *Increment = new (S.Context)
UnaryOperator(IterationVarRef.build(S, Loc), UO_PreInc, SizeType,
VK_LValue, OK_Ordinary, Loc, Upper.isMaxValue());
// Construct the loop that copies all elements of this array.
return S.ActOnForStmt(
Loc, Loc, InitStmt,
S.ActOnCondition(nullptr, Loc, Comparison, Sema::ConditionKind::Boolean),
S.MakeFullDiscardedValueExpr(Increment), Loc, Copy.get());
}
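// Illustrative sketch of the recursion above (hypothetical types): for a
// member 'T M[2][3]' where T has a non-trivial operator=, the generated code
// is roughly
//
//   for (__SIZE_TYPE__ __i0 = 0; __i0 != 2; ++__i0)
//     for (__SIZE_TYPE__ __i1 = 0; __i1 != 3; ++__i1)
//       this->M[__i0][__i1].T::operator=(other.M[__i0][__i1]);
//
// with the call qualified to suppress virtual dispatch. Had the selected
// operator= been trivial, the recursion would instead return a null statement
// so that the caller falls back to a memcpy of the whole array.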
static StmtResult
buildSingleCopyAssign(Sema &S, SourceLocation Loc, QualType T,
const ExprBuilder &To, const ExprBuilder &From,
bool CopyingBaseSubobject, bool Copying) {
// Maybe we should use a memcpy?
if (T->isArrayType() && !T.isConstQualified() && !T.isVolatileQualified() &&
T.isTriviallyCopyableType(S.Context))
return buildMemcpyForAssignmentOp(S, Loc, T, To, From);
StmtResult Result(buildSingleCopyAssignRecursively(S, Loc, T, To, From,
CopyingBaseSubobject,
Copying, 0));
// If we ended up picking a trivial assignment operator for an array of a
// non-trivially-copyable class type, just emit a memcpy.
if (!Result.isInvalid() && !Result.get())
return buildMemcpyForAssignmentOp(S, Loc, T, To, From);
return Result;
}
CXXMethodDecl *Sema::DeclareImplicitCopyAssignment(CXXRecordDecl *ClassDecl) {
// Note: The following rules are largely analogous to the copy
// constructor rules. Note that virtual bases are not taken into account
// for determining the argument type of the operator. Note also that
// operators taking an object instead of a reference are allowed.
assert(ClassDecl->needsImplicitCopyAssignment());
DeclaringSpecialMember DSM(*this, ClassDecl, CXXCopyAssignment);
if (DSM.isAlreadyBeingDeclared())
return nullptr;
QualType ArgType = Context.getTypeDeclType(ClassDecl);
if (Context.getLangOpts().OpenCLCPlusPlus)
ArgType = Context.getAddrSpaceQualType(ArgType, LangAS::opencl_generic);
QualType RetType = Context.getLValueReferenceType(ArgType);
bool Const = ClassDecl->implicitCopyAssignmentHasConstParam();
if (Const)
ArgType = ArgType.withConst();
ArgType = Context.getLValueReferenceType(ArgType);
bool Constexpr = defaultedSpecialMemberIsConstexpr(*this, ClassDecl,
CXXCopyAssignment,
Const);
// An implicitly-declared copy assignment operator is an inline public
// member of its class.
DeclarationName Name = Context.DeclarationNames.getCXXOperatorName(OO_Equal);
SourceLocation ClassLoc = ClassDecl->getLocation();
DeclarationNameInfo NameInfo(Name, ClassLoc);
CXXMethodDecl *CopyAssignment = CXXMethodDecl::Create(
Context, ClassDecl, ClassLoc, NameInfo, QualType(),
/*TInfo=*/nullptr, /*StorageClass=*/SC_None,
/*isInline=*/true, Constexpr ? CSK_constexpr : CSK_unspecified,
SourceLocation());
CopyAssignment->setAccess(AS_public);
CopyAssignment->setDefaulted();
CopyAssignment->setImplicit();
if (getLangOpts().CUDA) {
inferCUDATargetForImplicitSpecialMember(ClassDecl, CXXCopyAssignment,
CopyAssignment,
/* ConstRHS */ Const,
/* Diagnose */ false);
}
setupImplicitSpecialMemberType(CopyAssignment, RetType, ArgType);
// Add the parameter to the operator.
ParmVarDecl *FromParam = ParmVarDecl::Create(Context, CopyAssignment,
ClassLoc, ClassLoc,
/*Id=*/nullptr, ArgType,
/*TInfo=*/nullptr, SC_None,
nullptr);
CopyAssignment->setParams(FromParam);
CopyAssignment->setTrivial(
ClassDecl->needsOverloadResolutionForCopyAssignment()
? SpecialMemberIsTrivial(CopyAssignment, CXXCopyAssignment)
: ClassDecl->hasTrivialCopyAssignment());
// Note that we have added this copy-assignment operator.
++getASTContext().NumImplicitCopyAssignmentOperatorsDeclared;
Scope *S = getScopeForContext(ClassDecl);
CheckImplicitSpecialMemberDeclaration(S, CopyAssignment);
if (ShouldDeleteSpecialMember(CopyAssignment, CXXCopyAssignment))
SetDeclDeleted(CopyAssignment, ClassLoc);
if (S)
PushOnScopeChains(CopyAssignment, S, false);
ClassDecl->addDecl(CopyAssignment);
return CopyAssignment;
}
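// Illustrative result of the declaration above (hypothetical types):
//
//   struct C { int X; };             // declares C &C::operator=(const C &)
//   struct D { D &operator=(D &); };
//   struct E { D Member; };          // declares E &E::operator=(E &), since
//                                    // D's copy assignment takes a non-const
//                                    // reference.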
/// Diagnose the use of an implicitly-declared copy operation that is
/// deprecated because the class has a user-declared copy constructor,
/// copy assignment operator, or destructor.
static void diagnoseDeprecatedCopyOperation(Sema &S, CXXMethodDecl *CopyOp) {
assert(CopyOp->isImplicit());
CXXRecordDecl *RD = CopyOp->getParent();
CXXMethodDecl *UserDeclaredOperation = nullptr;
// In Microsoft mode, assignment operations don't affect constructors and
// vice versa.
if (RD->hasUserDeclaredDestructor()) {
UserDeclaredOperation = RD->getDestructor();
} else if (!isa<CXXConstructorDecl>(CopyOp) &&
RD->hasUserDeclaredCopyConstructor() &&
!S.getLangOpts().MSVCCompat) {
// Find any user-declared copy constructor.
for (auto *I : RD->ctors()) {
if (I->isCopyConstructor()) {
UserDeclaredOperation = I;
break;
}
}
assert(UserDeclaredOperation);
} else if (isa<CXXConstructorDecl>(CopyOp) &&
RD->hasUserDeclaredCopyAssignment() &&
!S.getLangOpts().MSVCCompat) {
// Find any user-declared copy assignment operator.
for (auto *I : RD->methods()) {
if (I->isCopyAssignmentOperator()) {
UserDeclaredOperation = I;
break;
}
}
assert(UserDeclaredOperation);
}
if (UserDeclaredOperation) {
S.Diag(UserDeclaredOperation->getLocation(),
diag::warn_deprecated_copy_operation)
<< RD << /*copy assignment*/!isa<CXXConstructorDecl>(CopyOp)
<< /*destructor*/isa<CXXDestructorDecl>(UserDeclaredOperation);
}
}
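// Illustrative trigger for the deprecation warning above (hypothetical type):
//
//   struct Handle {
//     ~Handle();                // user-declared destructor
//   };
//   void f(Handle &A, const Handle &B) {
//     A = B;                    // defining the implicit copy assignment here
//                               // is deprecated because of ~Handle().
//   }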
void Sema::DefineImplicitCopyAssignment(SourceLocation CurrentLocation,
CXXMethodDecl *CopyAssignOperator) {
assert((CopyAssignOperator->isDefaulted() &&
CopyAssignOperator->isOverloadedOperator() &&
CopyAssignOperator->getOverloadedOperator() == OO_Equal &&
!CopyAssignOperator->doesThisDeclarationHaveABody() &&
!CopyAssignOperator->isDeleted()) &&
"DefineImplicitCopyAssignment called for wrong function");
if (CopyAssignOperator->willHaveBody() || CopyAssignOperator->isInvalidDecl())
return;
CXXRecordDecl *ClassDecl = CopyAssignOperator->getParent();
if (ClassDecl->isInvalidDecl()) {
CopyAssignOperator->setInvalidDecl();
return;
}
SynthesizedFunctionScope Scope(*this, CopyAssignOperator);
// The exception specification is needed because we are defining the
// function.
ResolveExceptionSpec(CurrentLocation,
CopyAssignOperator->getType()->castAs<FunctionProtoType>());
// Add a context note for diagnostics produced after this point.
Scope.addContextNote(CurrentLocation);
// C++11 [class.copy]p18:
// The [definition of an implicitly declared copy assignment operator] is
// deprecated if the class has a user-declared copy constructor or a
// user-declared destructor.
if (getLangOpts().CPlusPlus11 && CopyAssignOperator->isImplicit())
diagnoseDeprecatedCopyOperation(*this, CopyAssignOperator);
// C++0x [class.copy]p30:
// The implicitly-defined or explicitly-defaulted copy assignment operator
// for a non-union class X performs memberwise copy assignment of its
// subobjects. The direct base classes of X are assigned first, in the
// order of their declaration in the base-specifier-list, and then the
// immediate non-static data members of X are assigned, in the order in
// which they were declared in the class definition.
// The statements that form the synthesized function body.
SmallVector<Stmt*, 8> Statements;
// The parameter for the "other" object, which we are copying from.
ParmVarDecl *Other = CopyAssignOperator->getParamDecl(0);
Qualifiers OtherQuals = Other->getType().getQualifiers();
QualType OtherRefType = Other->getType();
if (const LValueReferenceType *OtherRef
= OtherRefType->getAs<LValueReferenceType>()) {
OtherRefType = OtherRef->getPointeeType();
OtherQuals = OtherRefType.getQualifiers();
}
// Our location for everything implicitly-generated.
SourceLocation Loc = CopyAssignOperator->getEndLoc().isValid()
? CopyAssignOperator->getEndLoc()
: CopyAssignOperator->getLocation();
// Builds a DeclRefExpr for the "other" object.
RefBuilder OtherRef(Other, OtherRefType);
// Builds the "this" pointer.
ThisBuilder This;
// Assign base classes.
bool Invalid = false;
for (auto &Base : ClassDecl->bases()) {
// Form the assignment:
// static_cast<Base*>(this)->Base::operator=(static_cast<Base&>(other));
QualType BaseType = Base.getType().getUnqualifiedType();
if (!BaseType->isRecordType()) {
Invalid = true;
continue;
}
CXXCastPath BasePath;
BasePath.push_back(&Base);
// Construct the "from" expression, which is an implicit cast to the
// appropriately-qualified base type.
CastBuilder From(OtherRef, Context.getQualifiedType(BaseType, OtherQuals),
VK_LValue, BasePath);
// Dereference "this".
DerefBuilder DerefThis(This);
CastBuilder To(DerefThis,
Context.getQualifiedType(
BaseType, CopyAssignOperator->getMethodQualifiers()),
VK_LValue, BasePath);
// Build the copy.
StmtResult Copy = buildSingleCopyAssign(*this, Loc, BaseType,
To, From,
/*CopyingBaseSubobject=*/true,
/*Copying=*/true);
if (Copy.isInvalid()) {
CopyAssignOperator->setInvalidDecl();
return;
}
// Success! Record the copy.
Statements.push_back(Copy.getAs<Expr>());
}
// Assign non-static members.
for (auto *Field : ClassDecl->fields()) {
// FIXME: We should form some kind of AST representation for the implied
// memcpy in a union copy operation.
if (Field->isUnnamedBitfield() || Field->getParent()->isUnion())
continue;
if (Field->isInvalidDecl()) {
Invalid = true;
continue;
}
// Check for members of reference type; we can't copy those.
if (Field->getType()->isReferenceType()) {
Diag(ClassDecl->getLocation(), diag::err_uninitialized_member_for_assign)
<< Context.getTagDeclType(ClassDecl) << 0 << Field->getDeclName();
Diag(Field->getLocation(), diag::note_declared_at);
Invalid = true;
continue;
}
// Check for members of const-qualified, non-class type.
QualType BaseType = Context.getBaseElementType(Field->getType());
if (!BaseType->getAs<RecordType>() && BaseType.isConstQualified()) {
Diag(ClassDecl->getLocation(), diag::err_uninitialized_member_for_assign)
<< Context.getTagDeclType(ClassDecl) << 1 << Field->getDeclName();
Diag(Field->getLocation(), diag::note_declared_at);
Invalid = true;
continue;
}
// Suppress assigning zero-width bitfields.
if (Field->isZeroLengthBitField(Context))
continue;
QualType FieldType = Field->getType().getNonReferenceType();
if (FieldType->isIncompleteArrayType()) {
assert(ClassDecl->hasFlexibleArrayMember() &&
"Incomplete array type is not valid");
continue;
}
// Build references to the field in the object we're copying from and to.
CXXScopeSpec SS; // Intentionally empty
LookupResult MemberLookup(*this, Field->getDeclName(), Loc,
LookupMemberName);
MemberLookup.addDecl(Field);
MemberLookup.resolveKind();
MemberBuilder From(OtherRef, OtherRefType, /*IsArrow=*/false, MemberLookup);
MemberBuilder To(This, getCurrentThisType(), /*IsArrow=*/true, MemberLookup);
// Build the copy of this field.
StmtResult Copy = buildSingleCopyAssign(*this, Loc, FieldType,
To, From,
/*CopyingBaseSubobject=*/false,
/*Copying=*/true);
if (Copy.isInvalid()) {
CopyAssignOperator->setInvalidDecl();
return;
}
// Success! Record the copy.
Statements.push_back(Copy.getAs<Stmt>());
}
if (!Invalid) {
// Add a "return *this;"
ExprResult ThisObj = CreateBuiltinUnaryOp(Loc, UO_Deref, This.build(*this, Loc));
StmtResult Return = BuildReturnStmt(Loc, ThisObj.get());
if (Return.isInvalid())
Invalid = true;
else
Statements.push_back(Return.getAs<Stmt>());
}
if (Invalid) {
CopyAssignOperator->setInvalidDecl();
return;
}
StmtResult Body;
{
CompoundScopeRAII CompoundScope(*this);
Body = ActOnCompoundStmt(Loc, Loc, Statements,
/*isStmtExpr=*/false);
assert(!Body.isInvalid() && "Compound statement creation cannot fail");
}
CopyAssignOperator->setBody(Body.getAs<Stmt>());
CopyAssignOperator->markUsed(Context);
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(CopyAssignOperator);
}
}
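// Illustrative sketch of a body synthesized above (hypothetical types):
//
//   struct Base   { Base &operator=(const Base &); };
//   struct Member { Member &operator=(const Member &); };
//   struct Derived : Base { int X; Member M; };
//
// Derived's defaulted copy assignment is given a body roughly equivalent to
//
//   static_cast<Base*>(this)->Base::operator=(static_cast<const Base&>(other));
//   this->X = other.X;                    // scalar member: built-in assignment
//   this->M.Member::operator=(other.M);   // class member: qualified call
//   return *this;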
CXXMethodDecl *Sema::DeclareImplicitMoveAssignment(CXXRecordDecl *ClassDecl) {
assert(ClassDecl->needsImplicitMoveAssignment());
DeclaringSpecialMember DSM(*this, ClassDecl, CXXMoveAssignment);
if (DSM.isAlreadyBeingDeclared())
return nullptr;
// Note: The following rules are largely analogous to the move
// constructor rules.
QualType ArgType = Context.getTypeDeclType(ClassDecl);
if (Context.getLangOpts().OpenCLCPlusPlus)
ArgType = Context.getAddrSpaceQualType(ArgType, LangAS::opencl_generic);
QualType RetType = Context.getLValueReferenceType(ArgType);
ArgType = Context.getRValueReferenceType(ArgType);
bool Constexpr = defaultedSpecialMemberIsConstexpr(*this, ClassDecl,
CXXMoveAssignment,
false);
// An implicitly-declared move assignment operator is an inline public
// member of its class.
DeclarationName Name = Context.DeclarationNames.getCXXOperatorName(OO_Equal);
SourceLocation ClassLoc = ClassDecl->getLocation();
DeclarationNameInfo NameInfo(Name, ClassLoc);
CXXMethodDecl *MoveAssignment = CXXMethodDecl::Create(
Context, ClassDecl, ClassLoc, NameInfo, QualType(),
/*TInfo=*/nullptr, /*StorageClass=*/SC_None,
/*isInline=*/true, Constexpr ? CSK_constexpr : CSK_unspecified,
SourceLocation());
MoveAssignment->setAccess(AS_public);
MoveAssignment->setDefaulted();
MoveAssignment->setImplicit();
if (getLangOpts().CUDA) {
inferCUDATargetForImplicitSpecialMember(ClassDecl, CXXMoveAssignment,
MoveAssignment,
/* ConstRHS */ false,
/* Diagnose */ false);
}
// Build an exception specification pointing back at this member.
FunctionProtoType::ExtProtoInfo EPI =
getImplicitMethodEPI(*this, MoveAssignment);
MoveAssignment->setType(Context.getFunctionType(RetType, ArgType, EPI));
// Add the parameter to the operator.
ParmVarDecl *FromParam = ParmVarDecl::Create(Context, MoveAssignment,
ClassLoc, ClassLoc,
/*Id=*/nullptr, ArgType,
/*TInfo=*/nullptr, SC_None,
nullptr);
MoveAssignment->setParams(FromParam);
MoveAssignment->setTrivial(
ClassDecl->needsOverloadResolutionForMoveAssignment()
? SpecialMemberIsTrivial(MoveAssignment, CXXMoveAssignment)
: ClassDecl->hasTrivialMoveAssignment());
// Note that we have added this move-assignment operator.
++getASTContext().NumImplicitMoveAssignmentOperatorsDeclared;
Scope *S = getScopeForContext(ClassDecl);
CheckImplicitSpecialMemberDeclaration(S, MoveAssignment);
if (ShouldDeleteSpecialMember(MoveAssignment, CXXMoveAssignment)) {
ClassDecl->setImplicitMoveAssignmentIsDeleted();
SetDeclDeleted(MoveAssignment, ClassLoc);
}
if (S)
PushOnScopeChains(MoveAssignment, S, false);
ClassDecl->addDecl(MoveAssignment);
return MoveAssignment;
}
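// Illustrative result of the declaration above (hypothetical type): for
//
//   struct C { int X; };
//
// this declares C &C::operator=(C &&) as an inline public member; it is
// marked deleted if ShouldDeleteSpecialMember decides the defaulted
// definition would be ill-formed.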
/// Check if we're implicitly defining a move assignment operator for a class
/// with virtual bases. Such a move assignment might move-assign the virtual
/// base multiple times.
static void checkMoveAssignmentForRepeatedMove(Sema &S, CXXRecordDecl *Class,
SourceLocation CurrentLocation) {
assert(!Class->isDependentContext() && "should not define dependent move");
// Only a virtual base could get implicitly move-assigned multiple times.
// Only a non-trivial move assignment can observe this. We only want to
// diagnose if we implicitly define an assignment operator that assigns
// two base classes, both of which move-assign the same virtual base.
if (Class->getNumVBases() == 0 || Class->hasTrivialMoveAssignment() ||
Class->getNumBases() < 2)
return;
llvm::SmallVector<CXXBaseSpecifier *, 16> Worklist;
typedef llvm::DenseMap<CXXRecordDecl*, CXXBaseSpecifier*> VBaseMap;
VBaseMap VBases;
for (auto &BI : Class->bases()) {
Worklist.push_back(&BI);
while (!Worklist.empty()) {
CXXBaseSpecifier *BaseSpec = Worklist.pop_back_val();
CXXRecordDecl *Base = BaseSpec->getType()->getAsCXXRecordDecl();
// If the base has no non-trivial move assignment operators,
// we don't care about moves from it.
if (!Base->hasNonTrivialMoveAssignment())
continue;
// If there's nothing virtual here, skip it.
if (!BaseSpec->isVirtual() && !Base->getNumVBases())
continue;
// If we're not actually going to call a move assignment for this base,
// or the selected move assignment is trivial, skip it.
Sema::SpecialMemberOverloadResult SMOR =
S.LookupSpecialMember(Base, Sema::CXXMoveAssignment,
/*ConstArg*/false, /*VolatileArg*/false,
/*RValueThis*/true, /*ConstThis*/false,
/*VolatileThis*/false);
if (!SMOR.getMethod() || SMOR.getMethod()->isTrivial() ||
!SMOR.getMethod()->isMoveAssignmentOperator())
continue;
if (BaseSpec->isVirtual()) {
// We're going to move-assign this virtual base, and its move
// assignment operator is not trivial. If this can happen for
// multiple distinct direct bases of Class, diagnose it. (If it
// only happens in one base, we'll diagnose it when synthesizing
// that base class's move assignment operator.)
CXXBaseSpecifier *&Existing =
VBases.insert(std::make_pair(Base->getCanonicalDecl(), &BI))
.first->second;
if (Existing && Existing != &BI) {
S.Diag(CurrentLocation, diag::warn_vbase_moved_multiple_times)
<< Class << Base;
S.Diag(Existing->getBeginLoc(), diag::note_vbase_moved_here)
<< (Base->getCanonicalDecl() ==
Existing->getType()->getAsCXXRecordDecl()->getCanonicalDecl())
<< Base << Existing->getType() << Existing->getSourceRange();
S.Diag(BI.getBeginLoc(), diag::note_vbase_moved_here)
<< (Base->getCanonicalDecl() ==
BI.getType()->getAsCXXRecordDecl()->getCanonicalDecl())
<< Base << BI.getType() << BaseSpec->getSourceRange();
// Only diagnose each vbase once.
Existing = nullptr;
}
} else {
// Only walk over bases that have defaulted move assignment operators.
// We assume that any user-provided move assignment operator handles
// the multiple-moves-of-vbase case itself somehow.
if (!SMOR.getMethod()->isDefaulted())
continue;
// We're going to move the base classes of Base. Add them to the list.
for (auto &BI : Base->bases())
Worklist.push_back(&BI);
}
}
}
}
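// Illustrative trigger for the diagnostic above (hypothetical types):
//
//   struct V { V &operator=(V &&); };   // non-trivial move assignment
//   struct A : virtual V {};
//   struct B : virtual V {};
//   struct C : A, B {};
//
// Defining C's defaulted move assignment (e.g. for 'c1 = std::move(c2)')
// warns that the virtual base V may be move-assigned twice, once via A and
// once via B.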
void Sema::DefineImplicitMoveAssignment(SourceLocation CurrentLocation,
CXXMethodDecl *MoveAssignOperator) {
assert((MoveAssignOperator->isDefaulted() &&
MoveAssignOperator->isOverloadedOperator() &&
MoveAssignOperator->getOverloadedOperator() == OO_Equal &&
!MoveAssignOperator->doesThisDeclarationHaveABody() &&
!MoveAssignOperator->isDeleted()) &&
"DefineImplicitMoveAssignment called for wrong function");
if (MoveAssignOperator->willHaveBody() || MoveAssignOperator->isInvalidDecl())
return;
CXXRecordDecl *ClassDecl = MoveAssignOperator->getParent();
if (ClassDecl->isInvalidDecl()) {
MoveAssignOperator->setInvalidDecl();
return;
}
// C++0x [class.copy]p28:
// The implicitly-defined or explicitly-defaulted move assignment operator for a non-union class
// X performs memberwise move assignment of its subobjects. The direct base
// classes of X are assigned first, in the order of their declaration in the
// base-specifier-list, and then the immediate non-static data members of X
// are assigned, in the order in which they were declared in the class
// definition.
// Issue a warning if our implicit move assignment operator will move
// from a virtual base more than once.
checkMoveAssignmentForRepeatedMove(*this, ClassDecl, CurrentLocation);
SynthesizedFunctionScope Scope(*this, MoveAssignOperator);
// The exception specification is needed because we are defining the
// function.
ResolveExceptionSpec(CurrentLocation,
MoveAssignOperator->getType()->castAs<FunctionProtoType>());
// Add a context note for diagnostics produced after this point.
Scope.addContextNote(CurrentLocation);
// The statements that form the synthesized function body.
SmallVector<Stmt*, 8> Statements;
// The parameter for the "other" object, which we are move from.
ParmVarDecl *Other = MoveAssignOperator->getParamDecl(0);
QualType OtherRefType = Other->getType()->
getAs<RValueReferenceType>()->getPointeeType();
// Our location for everything implicitly-generated.
SourceLocation Loc = MoveAssignOperator->getEndLoc().isValid()
? MoveAssignOperator->getEndLoc()
: MoveAssignOperator->getLocation();
// Builds a reference to the "other" object.
RefBuilder OtherRef(Other, OtherRefType);
// Cast to rvalue.
MoveCastBuilder MoveOther(OtherRef);
// Builds the "this" pointer.
ThisBuilder This;
// Assign base classes.
bool Invalid = false;
for (auto &Base : ClassDecl->bases()) {
// C++11 [class.copy]p28:
// It is unspecified whether subobjects representing virtual base classes
// are assigned more than once by the implicitly-defined copy assignment
// operator.
// FIXME: Do not assign to a vbase that will be assigned by some other base
// class. For a move-assignment, this can result in the vbase being moved
// multiple times.
// Form the assignment:
// static_cast<Base*>(this)->Base::operator=(static_cast<Base&&>(other));
QualType BaseType = Base.getType().getUnqualifiedType();
if (!BaseType->isRecordType()) {
Invalid = true;
continue;
}
CXXCastPath BasePath;
BasePath.push_back(&Base);
// Construct the "from" expression, which is an implicit cast to the
// appropriately-qualified base type.
CastBuilder From(OtherRef, BaseType, VK_XValue, BasePath);
// Dereference "this".
DerefBuilder DerefThis(This);
// Implicitly cast "this" to the appropriately-qualified base type.
CastBuilder To(DerefThis,
Context.getQualifiedType(
BaseType, MoveAssignOperator->getMethodQualifiers()),
VK_LValue, BasePath);
// Build the move.
StmtResult Move = buildSingleCopyAssign(*this, Loc, BaseType,
To, From,
/*CopyingBaseSubobject=*/true,
/*Copying=*/false);
if (Move.isInvalid()) {
MoveAssignOperator->setInvalidDecl();
return;
}
// Success! Record the move.
Statements.push_back(Move.getAs<Expr>());
}
// Assign non-static members.
for (auto *Field : ClassDecl->fields()) {
// FIXME: We should form some kind of AST representation for the implied
// memcpy in a union copy operation.
if (Field->isUnnamedBitfield() || Field->getParent()->isUnion())
continue;
if (Field->isInvalidDecl()) {
Invalid = true;
continue;
}
// Check for members of reference type; we can't move those.
if (Field->getType()->isReferenceType()) {
Diag(ClassDecl->getLocation(), diag::err_uninitialized_member_for_assign)
<< Context.getTagDeclType(ClassDecl) << 0 << Field->getDeclName();
Diag(Field->getLocation(), diag::note_declared_at);
Invalid = true;
continue;
}
// Check for members of const-qualified, non-class type.
QualType BaseType = Context.getBaseElementType(Field->getType());
if (!BaseType->getAs<RecordType>() && BaseType.isConstQualified()) {
Diag(ClassDecl->getLocation(), diag::err_uninitialized_member_for_assign)
<< Context.getTagDeclType(ClassDecl) << 1 << Field->getDeclName();
Diag(Field->getLocation(), diag::note_declared_at);
Invalid = true;
continue;
}
// Suppress assigning zero-width bitfields.
if (Field->isZeroLengthBitField(Context))
continue;
QualType FieldType = Field->getType().getNonReferenceType();
if (FieldType->isIncompleteArrayType()) {
assert(ClassDecl->hasFlexibleArrayMember() &&
"Incomplete array type is not valid");
continue;
}
// Build references to the field in the object we're copying from and to.
LookupResult MemberLookup(*this, Field->getDeclName(), Loc,
LookupMemberName);
MemberLookup.addDecl(Field);
MemberLookup.resolveKind();
MemberBuilder From(MoveOther, OtherRefType,
/*IsArrow=*/false, MemberLookup);
MemberBuilder To(This, getCurrentThisType(),
/*IsArrow=*/true, MemberLookup);
assert(!From.build(*this, Loc)->isLValue() && // could be xvalue or prvalue
"Member reference with rvalue base must be rvalue except for reference "
"members, which aren't allowed for move assignment.");
// Build the move of this field.
StmtResult Move = buildSingleCopyAssign(*this, Loc, FieldType,
To, From,
/*CopyingBaseSubobject=*/false,
/*Copying=*/false);
if (Move.isInvalid()) {
MoveAssignOperator->setInvalidDecl();
return;
}
// Success! Record the move.
Statements.push_back(Move.getAs<Stmt>());
}
if (!Invalid) {
// Add a "return *this;"
ExprResult ThisObj =
CreateBuiltinUnaryOp(Loc, UO_Deref, This.build(*this, Loc));
StmtResult Return = BuildReturnStmt(Loc, ThisObj.get());
if (Return.isInvalid())
Invalid = true;
else
Statements.push_back(Return.getAs<Stmt>());
}
if (Invalid) {
MoveAssignOperator->setInvalidDecl();
return;
}
StmtResult Body;
{
CompoundScopeRAII CompoundScope(*this);
Body = ActOnCompoundStmt(Loc, Loc, Statements,
/*isStmtExpr=*/false);
assert(!Body.isInvalid() && "Compound statement creation cannot fail");
}
MoveAssignOperator->setBody(Body.getAs<Stmt>());
MoveAssignOperator->markUsed(Context);
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(MoveAssignOperator);
}
}
CXXConstructorDecl *Sema::DeclareImplicitCopyConstructor(
CXXRecordDecl *ClassDecl) {
// C++ [class.copy]p4:
// If the class definition does not explicitly declare a copy
// constructor, one is declared implicitly.
assert(ClassDecl->needsImplicitCopyConstructor());
DeclaringSpecialMember DSM(*this, ClassDecl, CXXCopyConstructor);
if (DSM.isAlreadyBeingDeclared())
return nullptr;
QualType ClassType = Context.getTypeDeclType(ClassDecl);
QualType ArgType = ClassType;
bool Const = ClassDecl->implicitCopyConstructorHasConstParam();
if (Const)
ArgType = ArgType.withConst();
if (Context.getLangOpts().OpenCLCPlusPlus)
ArgType = Context.getAddrSpaceQualType(ArgType, LangAS::opencl_generic);
ArgType = Context.getLValueReferenceType(ArgType);
bool Constexpr = defaultedSpecialMemberIsConstexpr(*this, ClassDecl,
CXXCopyConstructor,
Const);
DeclarationName Name
= Context.DeclarationNames.getCXXConstructorName(
Context.getCanonicalType(ClassType));
SourceLocation ClassLoc = ClassDecl->getLocation();
DeclarationNameInfo NameInfo(Name, ClassLoc);
// An implicitly-declared copy constructor is an inline public
// member of its class.
CXXConstructorDecl *CopyConstructor = CXXConstructorDecl::Create(
Context, ClassDecl, ClassLoc, NameInfo, QualType(), /*TInfo=*/nullptr,
ExplicitSpecifier(),
/*isInline=*/true,
/*isImplicitlyDeclared=*/true,
Constexpr ? CSK_constexpr : CSK_unspecified);
CopyConstructor->setAccess(AS_public);
CopyConstructor->setDefaulted();
if (getLangOpts().CUDA) {
inferCUDATargetForImplicitSpecialMember(ClassDecl, CXXCopyConstructor,
CopyConstructor,
/* ConstRHS */ Const,
/* Diagnose */ false);
}
setupImplicitSpecialMemberType(CopyConstructor, Context.VoidTy, ArgType);
// Add the parameter to the constructor.
ParmVarDecl *FromParam = ParmVarDecl::Create(Context, CopyConstructor,
ClassLoc, ClassLoc,
/*IdentifierInfo=*/nullptr,
ArgType, /*TInfo=*/nullptr,
SC_None, nullptr);
CopyConstructor->setParams(FromParam);
CopyConstructor->setTrivial(
ClassDecl->needsOverloadResolutionForCopyConstructor()
? SpecialMemberIsTrivial(CopyConstructor, CXXCopyConstructor)
: ClassDecl->hasTrivialCopyConstructor());
CopyConstructor->setTrivialForCall(
ClassDecl->hasAttr<TrivialABIAttr>() ||
(ClassDecl->needsOverloadResolutionForCopyConstructor()
? SpecialMemberIsTrivial(CopyConstructor, CXXCopyConstructor,
TAH_ConsiderTrivialABI)
: ClassDecl->hasTrivialCopyConstructorForCall()));
// Note that we have declared this constructor.
++getASTContext().NumImplicitCopyConstructorsDeclared;
Scope *S = getScopeForContext(ClassDecl);
CheckImplicitSpecialMemberDeclaration(S, CopyConstructor);
if (ShouldDeleteSpecialMember(CopyConstructor, CXXCopyConstructor)) {
ClassDecl->setImplicitCopyConstructorIsDeleted();
SetDeclDeleted(CopyConstructor, ClassLoc);
}
if (S)
PushOnScopeChains(CopyConstructor, S, false);
ClassDecl->addDecl(CopyConstructor);
return CopyConstructor;
}
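// Illustrative result of the declaration above (hypothetical types):
//
//   struct P { P(P &); };        // copy constructor takes a non-const reference
//   struct Q { int I; };         // declares Q::Q(const Q &)
//   struct R { P Member; };      // declares R::R(R &), since a subobject's
//                                // copy constructor takes a non-const
//                                // reference.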
void Sema::DefineImplicitCopyConstructor(SourceLocation CurrentLocation,
CXXConstructorDecl *CopyConstructor) {
assert((CopyConstructor->isDefaulted() &&
CopyConstructor->isCopyConstructor() &&
!CopyConstructor->doesThisDeclarationHaveABody() &&
!CopyConstructor->isDeleted()) &&
"DefineImplicitCopyConstructor - call it for implicit copy ctor");
if (CopyConstructor->willHaveBody() || CopyConstructor->isInvalidDecl())
return;
CXXRecordDecl *ClassDecl = CopyConstructor->getParent();
assert(ClassDecl && "DefineImplicitCopyConstructor - invalid constructor");
SynthesizedFunctionScope Scope(*this, CopyConstructor);
// The exception specification is needed because we are defining the
// function.
ResolveExceptionSpec(CurrentLocation,
CopyConstructor->getType()->castAs<FunctionProtoType>());
MarkVTableUsed(CurrentLocation, ClassDecl);
// Add a context note for diagnostics produced after this point.
Scope.addContextNote(CurrentLocation);
// C++11 [class.copy]p7:
// The [definition of an implicitly declared copy constructor] is
// deprecated if the class has a user-declared copy assignment operator
// or a user-declared destructor.
if (getLangOpts().CPlusPlus11 && CopyConstructor->isImplicit())
diagnoseDeprecatedCopyOperation(*this, CopyConstructor);
if (SetCtorInitializers(CopyConstructor, /*AnyErrors=*/false)) {
CopyConstructor->setInvalidDecl();
} else {
SourceLocation Loc = CopyConstructor->getEndLoc().isValid()
? CopyConstructor->getEndLoc()
: CopyConstructor->getLocation();
Sema::CompoundScopeRAII CompoundScope(*this);
CopyConstructor->setBody(
ActOnCompoundStmt(Loc, Loc, None, /*isStmtExpr=*/false).getAs<Stmt>());
CopyConstructor->markUsed(Context);
}
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(CopyConstructor);
}
}
CXXConstructorDecl *Sema::DeclareImplicitMoveConstructor(
CXXRecordDecl *ClassDecl) {
assert(ClassDecl->needsImplicitMoveConstructor());
DeclaringSpecialMember DSM(*this, ClassDecl, CXXMoveConstructor);
if (DSM.isAlreadyBeingDeclared())
return nullptr;
QualType ClassType = Context.getTypeDeclType(ClassDecl);
QualType ArgType = ClassType;
if (Context.getLangOpts().OpenCLCPlusPlus)
ArgType = Context.getAddrSpaceQualType(ClassType, LangAS::opencl_generic);
ArgType = Context.getRValueReferenceType(ArgType);
bool Constexpr = defaultedSpecialMemberIsConstexpr(*this, ClassDecl,
CXXMoveConstructor,
false);
DeclarationName Name
= Context.DeclarationNames.getCXXConstructorName(
Context.getCanonicalType(ClassType));
SourceLocation ClassLoc = ClassDecl->getLocation();
DeclarationNameInfo NameInfo(Name, ClassLoc);
// C++11 [class.copy]p11:
// An implicitly-declared copy/move constructor is an inline public
// member of its class.
CXXConstructorDecl *MoveConstructor = CXXConstructorDecl::Create(
Context, ClassDecl, ClassLoc, NameInfo, QualType(), /*TInfo=*/nullptr,
ExplicitSpecifier(),
/*isInline=*/true,
/*isImplicitlyDeclared=*/true,
Constexpr ? CSK_constexpr : CSK_unspecified);
MoveConstructor->setAccess(AS_public);
MoveConstructor->setDefaulted();
if (getLangOpts().CUDA) {
inferCUDATargetForImplicitSpecialMember(ClassDecl, CXXMoveConstructor,
MoveConstructor,
/* ConstRHS */ false,
/* Diagnose */ false);
}
setupImplicitSpecialMemberType(MoveConstructor, Context.VoidTy, ArgType);
// Add the parameter to the constructor.
ParmVarDecl *FromParam = ParmVarDecl::Create(Context, MoveConstructor,
ClassLoc, ClassLoc,
/*IdentifierInfo=*/nullptr,
ArgType, /*TInfo=*/nullptr,
SC_None, nullptr);
MoveConstructor->setParams(FromParam);
MoveConstructor->setTrivial(
ClassDecl->needsOverloadResolutionForMoveConstructor()
? SpecialMemberIsTrivial(MoveConstructor, CXXMoveConstructor)
: ClassDecl->hasTrivialMoveConstructor());
MoveConstructor->setTrivialForCall(
ClassDecl->hasAttr<TrivialABIAttr>() ||
(ClassDecl->needsOverloadResolutionForMoveConstructor()
? SpecialMemberIsTrivial(MoveConstructor, CXXMoveConstructor,
TAH_ConsiderTrivialABI)
: ClassDecl->hasTrivialMoveConstructorForCall()));
// Note that we have declared this constructor.
++getASTContext().NumImplicitMoveConstructorsDeclared;
Scope *S = getScopeForContext(ClassDecl);
CheckImplicitSpecialMemberDeclaration(S, MoveConstructor);
if (ShouldDeleteSpecialMember(MoveConstructor, CXXMoveConstructor)) {
ClassDecl->setImplicitMoveConstructorIsDeleted();
SetDeclDeleted(MoveConstructor, ClassLoc);
}
if (S)
PushOnScopeChains(MoveConstructor, S, false);
ClassDecl->addDecl(MoveConstructor);
return MoveConstructor;
}
void Sema::DefineImplicitMoveConstructor(SourceLocation CurrentLocation,
CXXConstructorDecl *MoveConstructor) {
assert((MoveConstructor->isDefaulted() &&
MoveConstructor->isMoveConstructor() &&
!MoveConstructor->doesThisDeclarationHaveABody() &&
!MoveConstructor->isDeleted()) &&
"DefineImplicitMoveConstructor - call it for implicit move ctor");
if (MoveConstructor->willHaveBody() || MoveConstructor->isInvalidDecl())
return;
CXXRecordDecl *ClassDecl = MoveConstructor->getParent();
assert(ClassDecl && "DefineImplicitMoveConstructor - invalid constructor");
SynthesizedFunctionScope Scope(*this, MoveConstructor);
// The exception specification is needed because we are defining the
// function.
ResolveExceptionSpec(CurrentLocation,
MoveConstructor->getType()->castAs<FunctionProtoType>());
MarkVTableUsed(CurrentLocation, ClassDecl);
// Add a context note for diagnostics produced after this point.
Scope.addContextNote(CurrentLocation);
if (SetCtorInitializers(MoveConstructor, /*AnyErrors=*/false)) {
MoveConstructor->setInvalidDecl();
} else {
SourceLocation Loc = MoveConstructor->getEndLoc().isValid()
? MoveConstructor->getEndLoc()
: MoveConstructor->getLocation();
Sema::CompoundScopeRAII CompoundScope(*this);
MoveConstructor->setBody(ActOnCompoundStmt(
Loc, Loc, None, /*isStmtExpr=*/ false).getAs<Stmt>());
MoveConstructor->markUsed(Context);
}
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(MoveConstructor);
}
}
bool Sema::isImplicitlyDeleted(FunctionDecl *FD) {
return FD->isDeleted() && FD->isDefaulted() && isa<CXXMethodDecl>(FD);
}
void Sema::DefineImplicitLambdaToFunctionPointerConversion(
SourceLocation CurrentLocation,
CXXConversionDecl *Conv) {
SynthesizedFunctionScope Scope(*this, Conv);
assert(!Conv->getReturnType()->isUndeducedType());
CXXRecordDecl *Lambda = Conv->getParent();
FunctionDecl *CallOp = Lambda->getLambdaCallOperator();
FunctionDecl *Invoker = Lambda->getLambdaStaticInvoker();
if (auto *TemplateArgs = Conv->getTemplateSpecializationArgs()) {
CallOp = InstantiateFunctionDeclaration(
CallOp->getDescribedFunctionTemplate(), TemplateArgs, CurrentLocation);
if (!CallOp)
return;
Invoker = InstantiateFunctionDeclaration(
Invoker->getDescribedFunctionTemplate(), TemplateArgs, CurrentLocation);
if (!Invoker)
return;
}
if (CallOp->isInvalidDecl())
return;
// Mark the call operator referenced (and add to pending instantiations
// if necessary).
// For both the conversion and static-invoker template specializations
// we construct their bodies in this function, so no need to add them
// to the PendingInstantiations.
MarkFunctionReferenced(CurrentLocation, CallOp);
// Fill in the __invoke function with a dummy implementation. IR generation
// will fill in the actual details. Update its type in case it contained
// an 'auto'.
Invoker->markUsed(Context);
Invoker->setReferenced();
Invoker->setType(Conv->getReturnType()->getPointeeType());
Invoker->setBody(new (Context) CompoundStmt(Conv->getLocation()));
// Construct the body of the conversion function { return __invoke; }.
Expr *FunctionRef = BuildDeclRefExpr(Invoker, Invoker->getType(),
VK_LValue, Conv->getLocation());
assert(FunctionRef && "Can't refer to __invoke function?");
Stmt *Return = BuildReturnStmt(Conv->getLocation(), FunctionRef).get();
Conv->setBody(CompoundStmt::Create(Context, Return, Conv->getLocation(),
Conv->getLocation()));
Conv->markUsed(Context);
Conv->setReferenced();
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(Conv);
L->CompletedImplicitDefinition(Invoker);
}
}
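// Illustrative use of the conversion defined above (hypothetical lambda):
//
//   auto L = [](int X) { return X + 1; };
//   int (*Fn)(int) = L;          // calls the conversion function, whose body
//                                // is effectively "return __invoke;"; IR
//                                // generation fills in the static invoker.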
void Sema::DefineImplicitLambdaToBlockPointerConversion(
SourceLocation CurrentLocation,
CXXConversionDecl *Conv)
{
assert(!Conv->getParent()->isGenericLambda());
SynthesizedFunctionScope Scope(*this, Conv);
// Copy-initialize the lambda object as needed to capture it.
Expr *This = ActOnCXXThis(CurrentLocation).get();
Expr *DerefThis =CreateBuiltinUnaryOp(CurrentLocation, UO_Deref, This).get();
ExprResult BuildBlock = BuildBlockForLambdaConversion(CurrentLocation,
Conv->getLocation(),
Conv, DerefThis);
// If we're not under ARC, make sure we still get the _Block_copy/autorelease
// behavior. Note that only the general conversion function does this
// (since it's unusable otherwise); in the case where we inline the
// block literal, it has block literal lifetime semantics.
if (!BuildBlock.isInvalid() && !getLangOpts().ObjCAutoRefCount)
BuildBlock = ImplicitCastExpr::Create(Context, BuildBlock.get()->getType(),
CK_CopyAndAutoreleaseBlockObject,
BuildBlock.get(), nullptr, VK_RValue);
if (BuildBlock.isInvalid()) {
Diag(CurrentLocation, diag::note_lambda_to_block_conv);
Conv->setInvalidDecl();
return;
}
// Create the return statement that returns the block from the conversion
// function.
StmtResult Return = BuildReturnStmt(Conv->getLocation(), BuildBlock.get());
if (Return.isInvalid()) {
Diag(CurrentLocation, diag::note_lambda_to_block_conv);
Conv->setInvalidDecl();
return;
}
// Set the body of the conversion function.
Stmt *ReturnS = Return.get();
Conv->setBody(CompoundStmt::Create(Context, ReturnS, Conv->getLocation(),
Conv->getLocation()));
Conv->markUsed(Context);
// We're done; notify the mutation listener, if any.
if (ASTMutationListener *L = getASTMutationListener()) {
L->CompletedImplicitDefinition(Conv);
}
}
/// Determine whether the given list of arguments contains exactly one
/// "real" (non-default) argument.
static bool hasOneRealArgument(MultiExprArg Args) {
switch (Args.size()) {
case 0:
return false;
default:
if (!Args[1]->isDefaultArgument())
return false;
LLVM_FALLTHROUGH;
case 1:
return !Args[0]->isDefaultArgument();
}
return false;
}
ExprResult
Sema::BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
NamedDecl *FoundDecl,
CXXConstructorDecl *Constructor,
MultiExprArg ExprArgs,
bool HadMultipleCandidates,
bool IsListInitialization,
bool IsStdInitListInitialization,
bool RequiresZeroInit,
unsigned ConstructKind,
SourceRange ParenRange) {
bool Elidable = false;
// C++0x [class.copy]p34:
// When certain criteria are met, an implementation is allowed to
// omit the copy/move construction of a class object, even if the
// copy/move constructor and/or destructor for the object have
// side effects. [...]
// - when a temporary class object that has not been bound to a
// reference (12.2) would be copied/moved to a class object
// with the same cv-unqualified type, the copy/move operation
// can be omitted by constructing the temporary object
// directly into the target of the omitted copy/move
if (ConstructKind == CXXConstructExpr::CK_Complete && Constructor &&
Constructor->isCopyOrMoveConstructor() && hasOneRealArgument(ExprArgs)) {
Expr *SubExpr = ExprArgs[0];
Elidable = SubExpr->isTemporaryObject(
Context, cast<CXXRecordDecl>(FoundDecl->getDeclContext()));
}
return BuildCXXConstructExpr(ConstructLoc, DeclInitType,
FoundDecl, Constructor,
Elidable, ExprArgs, HadMultipleCandidates,
IsListInitialization,
IsStdInitListInitialization, RequiresZeroInit,
ConstructKind, ParenRange);
}
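// Illustrative sketch of the elision check above (hypothetical type,
// pre-C++17 semantics):
//
//   struct T { T(); T(const T &); };
//   T Make();
//   T Value = Make();   // the temporary returned by Make() may be copied
//                       // directly into Value, so the copy construction is
//                       // built with Elidable == true.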
ExprResult
Sema::BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
NamedDecl *FoundDecl,
CXXConstructorDecl *Constructor,
bool Elidable,
MultiExprArg ExprArgs,
bool HadMultipleCandidates,
bool IsListInitialization,
bool IsStdInitListInitialization,
bool RequiresZeroInit,
unsigned ConstructKind,
SourceRange ParenRange) {
if (auto *Shadow = dyn_cast<ConstructorUsingShadowDecl>(FoundDecl)) {
Constructor = findInheritingConstructor(ConstructLoc, Constructor, Shadow);
if (DiagnoseUseOfDecl(Constructor, ConstructLoc))
return ExprError();
}
return BuildCXXConstructExpr(
ConstructLoc, DeclInitType, Constructor, Elidable, ExprArgs,
HadMultipleCandidates, IsListInitialization, IsStdInitListInitialization,
RequiresZeroInit, ConstructKind, ParenRange);
}
/// BuildCXXConstructExpr - Creates a complete call to a constructor,
/// including handling of its default argument expressions.
ExprResult
Sema::BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
CXXConstructorDecl *Constructor,
bool Elidable,
MultiExprArg ExprArgs,
bool HadMultipleCandidates,
bool IsListInitialization,
bool IsStdInitListInitialization,
bool RequiresZeroInit,
unsigned ConstructKind,
SourceRange ParenRange) {
assert(declaresSameEntity(
Constructor->getParent(),
DeclInitType->getBaseElementTypeUnsafe()->getAsCXXRecordDecl()) &&
"given constructor for wrong type");
MarkFunctionReferenced(ConstructLoc, Constructor);
if (getLangOpts().CUDA && !CheckCUDACall(ConstructLoc, Constructor))
return ExprError();
return CXXConstructExpr::Create(
Context, DeclInitType, ConstructLoc, Constructor, Elidable,
ExprArgs, HadMultipleCandidates, IsListInitialization,
IsStdInitListInitialization, RequiresZeroInit,
static_cast<CXXConstructExpr::ConstructionKind>(ConstructKind),
ParenRange);
}
ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
assert(Field->hasInClassInitializer());
// If we already have the in-class initializer nothing needs to be done.
if (Field->getInClassInitializer())
return CXXDefaultInitExpr::Create(Context, Loc, Field, CurContext);
// If we might have already tried and failed to instantiate, don't try again.
if (Field->isInvalidDecl())
return ExprError();
// Maybe we haven't instantiated the in-class initializer. Go check the
// pattern FieldDecl to see if it has one.
CXXRecordDecl *ParentRD = cast<CXXRecordDecl>(Field->getParent());
if (isTemplateInstantiation(ParentRD->getTemplateSpecializationKind())) {
CXXRecordDecl *ClassPattern = ParentRD->getTemplateInstantiationPattern();
DeclContext::lookup_result Lookup =
ClassPattern->lookup(Field->getDeclName());
// Lookup can return at most two results: the pattern for the field, or the
// injected class name of the parent record. No other member can have the
// same name as the field.
// In modules mode, lookup can return multiple results (coming from
// different modules).
assert((getLangOpts().Modules || (!Lookup.empty() && Lookup.size() <= 2)) &&
"more than two lookup results for field name");
FieldDecl *Pattern = dyn_cast<FieldDecl>(Lookup[0]);
if (!Pattern) {
assert(isa<CXXRecordDecl>(Lookup[0]) &&
"cannot have other non-field member with same name");
for (auto L : Lookup)
if (isa<FieldDecl>(L)) {
Pattern = cast<FieldDecl>(L);
break;
}
assert(Pattern && "We must have set the Pattern!");
}
if (!Pattern->hasInClassInitializer() ||
InstantiateInClassInitializer(Loc, Field, Pattern,
getTemplateInstantiationArgs(Field))) {
// Don't diagnose this again.
Field->setInvalidDecl();
return ExprError();
}
return CXXDefaultInitExpr::Create(Context, Loc, Field, CurContext);
}
// DR1351:
// If the brace-or-equal-initializer of a non-static data member
// invokes a defaulted default constructor of its class or of an
// enclosing class in a potentially evaluated subexpression, the
// program is ill-formed.
//
// This resolution is unworkable: the exception specification of the
// default constructor can be needed in an unevaluated context, in
// particular, in the operand of a noexcept-expression, and we can be
// unable to compute an exception specification for an enclosed class.
//
// Any attempt to resolve the exception specification of a defaulted default
// constructor before the initializer is lexically complete will ultimately
// come here at which point we can diagnose it.
RecordDecl *OutermostClass = ParentRD->getOuterLexicalRecordContext();
Diag(Loc, diag::err_in_class_initializer_not_yet_parsed)
<< OutermostClass << Field;
Diag(Field->getEndLoc(), diag::note_in_class_initializer_not_yet_parsed);
// Recover by marking the field invalid, unless we're in a SFINAE context.
if (!isSFINAEContext())
Field->setInvalidDecl();
return ExprError();
}
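// Illustrative sketch of the normal path above (hypothetical type): for
//
//   struct S { int X = 42; S() {} };
//
// the implicit initialization of X in S::S() reuses the parsed in-class
// initializer and is represented by the CXXDefaultInitExpr created here.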
void Sema::FinalizeVarWithDestructor(VarDecl *VD, const RecordType *Record) {
if (VD->isInvalidDecl()) return;
CXXRecordDecl *ClassDecl = cast<CXXRecordDecl>(Record->getDecl());
if (ClassDecl->isInvalidDecl()) return;
if (ClassDecl->hasIrrelevantDestructor()) return;
if (ClassDecl->isDependentContext()) return;
if (VD->isNoDestroy(getASTContext()))
return;
CXXDestructorDecl *Destructor = LookupDestructor(ClassDecl);
// If this is an array, we'll require the destructor during initialization, so
// we can skip over this. We still want to emit exit-time destructor warnings
// though.
if (!VD->getType()->isArrayType()) {
MarkFunctionReferenced(VD->getLocation(), Destructor);
CheckDestructorAccess(VD->getLocation(), Destructor,
PDiag(diag::err_access_dtor_var)
<< VD->getDeclName() << VD->getType());
DiagnoseUseOfDecl(Destructor, VD->getLocation());
}
if (Destructor->isTrivial()) return;
if (!VD->hasGlobalStorage()) return;
// Emit warning for non-trivial dtor in global scope (a real global,
// class-static, function-static).
Diag(VD->getLocation(), diag::warn_exit_time_destructor);
// TODO: this should be re-enabled for static locals by !CXAAtExit
if (!VD->isStaticLocal())
Diag(VD->getLocation(), diag::warn_global_destructor);
}
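// Illustrative trigger for the warnings above (hypothetical type):
//
//   struct Logger { ~Logger(); };
//   Logger GlobalLog;    // non-trivial destructor for an object with static
//                        // storage duration: warn_exit_time_destructor
//                        // (-Wexit-time-destructors) and, since it is not a
//                        // static local, warn_global_destructor.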
/// Given a constructor and the set of arguments provided for the
/// constructor, convert the arguments and add any required default arguments
/// to form a proper call to this constructor.
///
/// \returns true if an error occurred, false otherwise.
bool
Sema::CompleteConstructorCall(CXXConstructorDecl *Constructor,
MultiExprArg ArgsPtr,
SourceLocation Loc,
SmallVectorImpl<Expr*> &ConvertedArgs,
bool AllowExplicit,
bool IsListInitialization) {
// FIXME: This duplicates a lot of code from Sema::ConvertArgumentsForCall.
unsigned NumArgs = ArgsPtr.size();
Expr **Args = ArgsPtr.data();
const FunctionProtoType *Proto
= Constructor->getType()->getAs<FunctionProtoType>();
assert(Proto && "Constructor without a prototype?");
unsigned NumParams = Proto->getNumParams();
// If too few arguments are available, we'll fill in the rest with defaults.
if (NumArgs < NumParams)
ConvertedArgs.reserve(NumParams);
else
ConvertedArgs.reserve(NumArgs);
VariadicCallType CallType =
Proto->isVariadic() ? VariadicConstructor : VariadicDoesNotApply;
SmallVector<Expr *, 8> AllArgs;
bool Invalid = GatherArgumentsForCall(Loc, Constructor,
Proto, 0,
llvm::makeArrayRef(Args, NumArgs),
AllArgs,
CallType, AllowExplicit,
IsListInitialization);
ConvertedArgs.append(AllArgs.begin(), AllArgs.end());
DiagnoseSentinelCalls(Constructor, Loc, AllArgs);
CheckConstructorCall(Constructor,
llvm::makeArrayRef(AllArgs.data(), AllArgs.size()),
Proto, Loc);
return Invalid;
}
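// Illustrative sketch of argument completion above (hypothetical type):
//
//   struct S { S(int A, int B = 7); };
//
// Completing a construction of S from the single argument '3' converts that
// argument and appends a CXXDefaultArgExpr for B, so the call behaves like
// S(3, 7).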
static inline bool
CheckOperatorNewDeleteDeclarationScope(Sema &SemaRef,
const FunctionDecl *FnDecl) {
const DeclContext *DC = FnDecl->getDeclContext()->getRedeclContext();
if (isa<NamespaceDecl>(DC)) {
return SemaRef.Diag(FnDecl->getLocation(),
diag::err_operator_new_delete_declared_in_namespace)
<< FnDecl->getDeclName();
}
if (isa<TranslationUnitDecl>(DC) &&
FnDecl->getStorageClass() == SC_Static) {
return SemaRef.Diag(FnDecl->getLocation(),
diag::err_operator_new_delete_declared_static)
<< FnDecl->getDeclName();
}
return false;
}
static QualType
RemoveAddressSpaceFromPtr(Sema &SemaRef, const PointerType *PtrTy) {
QualType QTy = PtrTy->getPointeeType();
QTy = SemaRef.Context.removeAddrSpaceQualType(QTy);
return SemaRef.Context.getPointerType(QTy);
}
static inline bool
CheckOperatorNewDeleteTypes(Sema &SemaRef, const FunctionDecl *FnDecl,
CanQualType ExpectedResultType,
CanQualType ExpectedFirstParamType,
unsigned DependentParamTypeDiag,
unsigned InvalidParamTypeDiag) {
QualType ResultType =
FnDecl->getType()->getAs<FunctionType>()->getReturnType();
// Check that the result type is not dependent.
if (ResultType->isDependentType())
return SemaRef.Diag(FnDecl->getLocation(),
diag::err_operator_new_delete_dependent_result_type)
<< FnDecl->getDeclName() << ExpectedResultType;
// The operator is valid on any address space for OpenCL.
if (SemaRef.getLangOpts().OpenCLCPlusPlus) {
if (auto *PtrTy = ResultType->getAs<PointerType>()) {
ResultType = RemoveAddressSpaceFromPtr(SemaRef, PtrTy);
}
}
// Check that the result type is what we expect.
if (SemaRef.Context.getCanonicalType(ResultType) != ExpectedResultType)
return SemaRef.Diag(FnDecl->getLocation(),
diag::err_operator_new_delete_invalid_result_type)
<< FnDecl->getDeclName() << ExpectedResultType;
// A function template must have at least 2 parameters.
if (FnDecl->getDescribedFunctionTemplate() && FnDecl->getNumParams() < 2)
return SemaRef.Diag(FnDecl->getLocation(),
diag::err_operator_new_delete_template_too_few_parameters)
<< FnDecl->getDeclName();
// The function decl must have at least 1 parameter.
if (FnDecl->getNumParams() == 0)
return SemaRef.Diag(FnDecl->getLocation(),
diag::err_operator_new_delete_too_few_parameters)
<< FnDecl->getDeclName();
// Check the first parameter type is not dependent.
QualType FirstParamType = FnDecl->getParamDecl(0)->getType();
if (FirstParamType->isDependentType())
return SemaRef.Diag(FnDecl->getLocation(), DependentParamTypeDiag)
<< FnDecl->getDeclName() << ExpectedFirstParamType;
// Check that the first parameter type is what we expect.
if (SemaRef.getLangOpts().OpenCLCPlusPlus) {
// The operator is valid on any address space for OpenCL.
if (auto *PtrTy =
FnDecl->getParamDecl(0)->getType()->getAs<PointerType>()) {
FirstParamType = RemoveAddressSpaceFromPtr(SemaRef, PtrTy);
}
}
if (SemaRef.Context.getCanonicalType(FirstParamType).getUnqualifiedType() !=
ExpectedFirstParamType)
return SemaRef.Diag(FnDecl->getLocation(), InvalidParamTypeDiag)
<< FnDecl->getDeclName() << ExpectedFirstParamType;
return false;
}
static bool
CheckOperatorNewDeclaration(Sema &SemaRef, const FunctionDecl *FnDecl) {
// C++ [basic.stc.dynamic.allocation]p1:
// A program is ill-formed if an allocation function is declared in a
// namespace scope other than global scope or declared static in global
// scope.
if (CheckOperatorNewDeleteDeclarationScope(SemaRef, FnDecl))
return true;
CanQualType SizeTy =
SemaRef.Context.getCanonicalType(SemaRef.Context.getSizeType());
// C++ [basic.stc.dynamic.allocation]p1:
// The return type shall be void*. The first parameter shall have type
// std::size_t.
if (CheckOperatorNewDeleteTypes(SemaRef, FnDecl, SemaRef.Context.VoidPtrTy,
SizeTy,
diag::err_operator_new_dependent_param_type,
diag::err_operator_new_param_type))
return true;
// C++ [basic.stc.dynamic.allocation]p1:
// The first parameter shall not have an associated default argument.
if (FnDecl->getParamDecl(0)->hasDefaultArg())
return SemaRef.Diag(FnDecl->getLocation(),
diag::err_operator_new_default_arg)
<< FnDecl->getDeclName() << FnDecl->getParamDecl(0)->getDefaultArgRange();
return false;
}
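// As a sketch of the checks above, declarations at global scope behave as
// follows (illustrative, not exhaustive):
//   void *operator new(std::size_t);            // OK: usual allocation function
//   void *operator new(std::size_t, void *);    // OK: placement form
//   static void *operator new(std::size_t);     // error: declared 'static' at global scope
//   int operator new(std::size_t);              // error: return type must be void*
//   void *operator new(std::size_t = 32);       // error: default argument on first parameter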
static bool
CheckOperatorDeleteDeclaration(Sema &SemaRef, FunctionDecl *FnDecl) {
// C++ [basic.stc.dynamic.deallocation]p1:
// A program is ill-formed if deallocation functions are declared in a
// namespace scope other than global scope or declared static in global
// scope.
if (CheckOperatorNewDeleteDeclarationScope(SemaRef, FnDecl))
return true;
auto *MD = dyn_cast<CXXMethodDecl>(FnDecl);
// C++ P0722:
// Within a class C, the first parameter of a destroying operator delete
// shall be of type C *. The first parameter of any other deallocation
// function shall be of type void *.
CanQualType ExpectedFirstParamType =
MD && MD->isDestroyingOperatorDelete()
? SemaRef.Context.getCanonicalType(SemaRef.Context.getPointerType(
SemaRef.Context.getRecordType(MD->getParent())))
: SemaRef.Context.VoidPtrTy;
// C++ [basic.stc.dynamic.deallocation]p2:
// Each deallocation function shall return void
if (CheckOperatorNewDeleteTypes(
SemaRef, FnDecl, SemaRef.Context.VoidTy, ExpectedFirstParamType,
diag::err_operator_delete_dependent_param_type,
diag::err_operator_delete_param_type))
return true;
// C++ P0722:
// A destroying operator delete shall be a usual deallocation function.
if (MD && !MD->getParent()->isDependentContext() &&
MD->isDestroyingOperatorDelete() &&
!SemaRef.isUsualDeallocationFunction(MD)) {
SemaRef.Diag(MD->getLocation(),
diag::err_destroying_operator_delete_not_usual);
return true;
}
return false;
}
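// A sketch of declarations exercising the deallocation checks above, assuming
// C++2a destroying-delete support (std::destroying_delete_t from <new>):
//   struct C {
//     void operator delete(void *);                         // OK: usual form, void* first
//     void operator delete(C *, std::destroying_delete_t);  // OK: destroying delete, C* first
//     int operator delete(void *);                          // error: must return void
//   };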
/// CheckOverloadedOperatorDeclaration - Check whether the declaration
/// of this overloaded operator is well-formed. If so, returns false;
/// otherwise, emits appropriate diagnostics and returns true.
bool Sema::CheckOverloadedOperatorDeclaration(FunctionDecl *FnDecl) {
assert(FnDecl && FnDecl->isOverloadedOperator() &&
"Expected an overloaded operator declaration");
OverloadedOperatorKind Op = FnDecl->getOverloadedOperator();
// C++ [over.oper]p5:
// The allocation and deallocation functions, operator new,
// operator new[], operator delete and operator delete[], are
// described completely in 3.7.3. The attributes and restrictions
// found in the rest of this subclause do not apply to them unless
// explicitly stated in 3.7.3.
if (Op == OO_Delete || Op == OO_Array_Delete)
return CheckOperatorDeleteDeclaration(*this, FnDecl);
if (Op == OO_New || Op == OO_Array_New)
return CheckOperatorNewDeclaration(*this, FnDecl);
// C++ [over.oper]p6:
// An operator function shall either be a non-static member
// function or be a non-member function and have at least one
// parameter whose type is a class, a reference to a class, an
// enumeration, or a reference to an enumeration.
if (CXXMethodDecl *MethodDecl = dyn_cast<CXXMethodDecl>(FnDecl)) {
if (MethodDecl->isStatic())
return Diag(FnDecl->getLocation(),
diag::err_operator_overload_static) << FnDecl->getDeclName();
} else {
bool ClassOrEnumParam = false;
for (auto Param : FnDecl->parameters()) {
QualType ParamType = Param->getType().getNonReferenceType();
if (ParamType->isDependentType() || ParamType->isRecordType() ||
ParamType->isEnumeralType()) {
ClassOrEnumParam = true;
break;
}
}
if (!ClassOrEnumParam)
return Diag(FnDecl->getLocation(),
diag::err_operator_overload_needs_class_or_enum)
<< FnDecl->getDeclName();
}
// C++ [over.oper]p8:
// An operator function cannot have default arguments (8.3.6),
// except where explicitly stated below.
//
// Only the function-call operator allows default arguments
// (C++ [over.call]p1).
if (Op != OO_Call) {
for (auto Param : FnDecl->parameters()) {
if (Param->hasDefaultArg())
return Diag(Param->getLocation(),
diag::err_operator_overload_default_arg)
<< FnDecl->getDeclName() << Param->getDefaultArgRange();
}
}
static const bool OperatorUses[NUM_OVERLOADED_OPERATORS][3] = {
{ false, false, false }
#define OVERLOADED_OPERATOR(Name,Spelling,Token,Unary,Binary,MemberOnly) \
, { Unary, Binary, MemberOnly }
#include "clang/Basic/OperatorKinds.def"
};
bool CanBeUnaryOperator = OperatorUses[Op][0];
bool CanBeBinaryOperator = OperatorUses[Op][1];
bool MustBeMemberOperator = OperatorUses[Op][2];
// C++ [over.oper]p8:
// [...] Operator functions cannot have more or fewer parameters
// than the number required for the corresponding operator, as
// described in the rest of this subclause.
unsigned NumParams = FnDecl->getNumParams()
+ (isa<CXXMethodDecl>(FnDecl)? 1 : 0);
if (Op != OO_Call &&
((NumParams == 1 && !CanBeUnaryOperator) ||
(NumParams == 2 && !CanBeBinaryOperator) ||
(NumParams < 1) || (NumParams > 2))) {
// We have the wrong number of parameters.
unsigned ErrorKind;
if (CanBeUnaryOperator && CanBeBinaryOperator) {
ErrorKind = 2; // 2 -> unary or binary.
} else if (CanBeUnaryOperator) {
ErrorKind = 0; // 0 -> unary
} else {
assert(CanBeBinaryOperator &&
"All non-call overloaded operators are unary or binary!");
ErrorKind = 1; // 1 -> binary
}
return Diag(FnDecl->getLocation(), diag::err_operator_overload_must_be)
<< FnDecl->getDeclName() << NumParams << ErrorKind;
}
// Overloaded operators other than operator() cannot be variadic.
if (Op != OO_Call &&
FnDecl->getType()->getAs<FunctionProtoType>()->isVariadic()) {
return Diag(FnDecl->getLocation(), diag::err_operator_overload_variadic)
<< FnDecl->getDeclName();
}
// Some operators must be non-static member functions.
if (MustBeMemberOperator && !isa<CXXMethodDecl>(FnDecl)) {
return Diag(FnDecl->getLocation(),
diag::err_operator_overload_must_be_member)
<< FnDecl->getDeclName();
}
// C++ [over.inc]p1:
// The user-defined function called operator++ implements the
// prefix and postfix ++ operator. If this function is a member
// function with no parameters, or a non-member function with one
// parameter of class or enumeration type, it defines the prefix
// increment operator ++ for objects of that type. If the function
// is a member function with one parameter (which shall be of type
// int) or a non-member function with two parameters (the second
// of which shall be of type int), it defines the postfix
// increment operator ++ for objects of that type.
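// For example (illustrative only):
//   struct X {
//     X &operator++();     // prefix form: no parameters besides the implicit object
//     X operator++(int);   // postfix form: dummy parameter of type int
//   };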
if ((Op == OO_PlusPlus || Op == OO_MinusMinus) && NumParams == 2) {
ParmVarDecl *LastParam = FnDecl->getParamDecl(FnDecl->getNumParams() - 1);
QualType ParamType = LastParam->getType();
if (!ParamType->isSpecificBuiltinType(BuiltinType::Int) &&
!ParamType->isDependentType())
return Diag(LastParam->getLocation(),
diag::err_operator_overload_post_incdec_must_be_int)
<< LastParam->getType() << (Op == OO_MinusMinus);
}
return false;
}
static bool
checkLiteralOperatorTemplateParameterList(Sema &SemaRef,
FunctionTemplateDecl *TpDecl) {
TemplateParameterList *TemplateParams = TpDecl->getTemplateParameters();
// Must have one or two template parameters.
if (TemplateParams->size() == 1) {
NonTypeTemplateParmDecl *PmDecl =
dyn_cast<NonTypeTemplateParmDecl>(TemplateParams->getParam(0));
// The template parameter must be a char parameter pack.
if (PmDecl && PmDecl->isTemplateParameterPack() &&
SemaRef.Context.hasSameType(PmDecl->getType(), SemaRef.Context.CharTy))
return false;
} else if (TemplateParams->size() == 2) {
TemplateTypeParmDecl *PmType =
dyn_cast<TemplateTypeParmDecl>(TemplateParams->getParam(0));
NonTypeTemplateParmDecl *PmArgs =
dyn_cast<NonTypeTemplateParmDecl>(TemplateParams->getParam(1));
// The second template parameter must be a parameter pack with the
// first template parameter as its type.
if (PmType && PmArgs && !PmType->isTemplateParameterPack() &&
PmArgs->isTemplateParameterPack()) {
const TemplateTypeParmType *TArgs =
PmArgs->getType()->getAs<TemplateTypeParmType>();
if (TArgs && TArgs->getDepth() == PmType->getDepth() &&
TArgs->getIndex() == PmType->getIndex()) {
if (!SemaRef.inTemplateInstantiation())
SemaRef.Diag(TpDecl->getLocation(),
diag::ext_string_literal_operator_template);
return false;
}
}
}
SemaRef.Diag(TpDecl->getTemplateParameters()->getSourceRange().getBegin(),
diag::err_literal_operator_template)
<< TpDecl->getTemplateParameters()->getSourceRange();
return true;
}
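// The two accepted template-parameter-list forms, sketched with placeholder
// suffix names:
//   template <char...> int operator"" _digits();     // numeric literal operator template
//   template <class T, T...> int operator"" _raw();  // string literal operator template
//                                                    // (diagnosed as an extension above)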
/// CheckLiteralOperatorDeclaration - Check whether the declaration
/// of this literal operator function is well-formed. If so, returns
/// false; otherwise, emits appropriate diagnostics and returns true.
bool Sema::CheckLiteralOperatorDeclaration(FunctionDecl *FnDecl) {
if (isa<CXXMethodDecl>(FnDecl)) {
Diag(FnDecl->getLocation(), diag::err_literal_operator_outside_namespace)
<< FnDecl->getDeclName();
return true;
}
if (FnDecl->isExternC()) {
Diag(FnDecl->getLocation(), diag::err_literal_operator_extern_c);
if (const LinkageSpecDecl *LSD =
FnDecl->getDeclContext()->getExternCContext())
Diag(LSD->getExternLoc(), diag::note_extern_c_begins_here);
return true;
}
// This might be the definition of a literal operator template.
FunctionTemplateDecl *TpDecl = FnDecl->getDescribedFunctionTemplate();
// This might be a specialization of a literal operator template.
if (!TpDecl)
TpDecl = FnDecl->getPrimaryTemplate();
// template <char...> type operator "" name() and
// template <class T, T...> type operator "" name() are the only valid
// template signatures, and the only valid signatures with no parameters.
if (TpDecl) {
if (FnDecl->param_size() != 0) {
Diag(FnDecl->getLocation(),
diag::err_literal_operator_template_with_params);
return true;
}
if (checkLiteralOperatorTemplateParameterList(*this, TpDecl))
return true;
} else if (FnDecl->param_size() == 1) {
const ParmVarDecl *Param = FnDecl->getParamDecl(0);
QualType ParamType = Param->getType().getUnqualifiedType();
// Only unsigned long long int, long double, any character type, and const
// char * are allowed as the only parameter.
if (ParamType->isSpecificBuiltinType(BuiltinType::ULongLong) ||
ParamType->isSpecificBuiltinType(BuiltinType::LongDouble) ||
Context.hasSameType(ParamType, Context.CharTy) ||
Context.hasSameType(ParamType, Context.WideCharTy) ||
Context.hasSameType(ParamType, Context.Char8Ty) ||
Context.hasSameType(ParamType, Context.Char16Ty) ||
Context.hasSameType(ParamType, Context.Char32Ty)) {
} else if (const PointerType *Ptr = ParamType->getAs<PointerType>()) {
QualType InnerType = Ptr->getPointeeType();
// Pointer parameter must be a const char *.
if (!(Context.hasSameType(InnerType.getUnqualifiedType(),
Context.CharTy) &&
InnerType.isConstQualified() && !InnerType.isVolatileQualified())) {
Diag(Param->getSourceRange().getBegin(),
diag::err_literal_operator_param)
<< ParamType << "'const char *'" << Param->getSourceRange();
return true;
}
} else if (ParamType->isRealFloatingType()) {
Diag(Param->getSourceRange().getBegin(), diag::err_literal_operator_param)
<< ParamType << Context.LongDoubleTy << Param->getSourceRange();
return true;
} else if (ParamType->isIntegerType()) {
Diag(Param->getSourceRange().getBegin(), diag::err_literal_operator_param)
<< ParamType << Context.UnsignedLongLongTy << Param->getSourceRange();
return true;
} else {
Diag(Param->getSourceRange().getBegin(),
diag::err_literal_operator_invalid_param)
<< ParamType << Param->getSourceRange();
return true;
}
} else if (FnDecl->param_size() == 2) {
FunctionDecl::param_iterator Param = FnDecl->param_begin();
// First, verify that the first parameter is correct.
QualType FirstParamType = (*Param)->getType().getUnqualifiedType();
// A two-parameter function must have a pointer to const as its
// first parameter; let's strip those qualifiers.
const PointerType *PT = FirstParamType->getAs<PointerType>();
if (!PT) {
Diag((*Param)->getSourceRange().getBegin(),
diag::err_literal_operator_param)
<< FirstParamType << "'const char *'" << (*Param)->getSourceRange();
return true;
}
QualType PointeeType = PT->getPointeeType();
// First parameter must be const
if (!PointeeType.isConstQualified() || PointeeType.isVolatileQualified()) {
Diag((*Param)->getSourceRange().getBegin(),
diag::err_literal_operator_param)
<< FirstParamType << "'const char *'" << (*Param)->getSourceRange();
return true;
}
QualType InnerType = PointeeType.getUnqualifiedType();
// Only const char *, const wchar_t*, const char8_t*, const char16_t*, and
// const char32_t* are allowed as the first parameter to a two-parameter
// function
if (!(Context.hasSameType(InnerType, Context.CharTy) ||
Context.hasSameType(InnerType, Context.WideCharTy) ||
Context.hasSameType(InnerType, Context.Char8Ty) ||
Context.hasSameType(InnerType, Context.Char16Ty) ||
Context.hasSameType(InnerType, Context.Char32Ty))) {
Diag((*Param)->getSourceRange().getBegin(),
diag::err_literal_operator_param)
<< FirstParamType << "'const char *'" << (*Param)->getSourceRange();
return true;
}
// Move on to the second and final parameter.
++Param;
// The second parameter must be a std::size_t.
QualType SecondParamType = (*Param)->getType().getUnqualifiedType();
if (!Context.hasSameType(SecondParamType, Context.getSizeType())) {
Diag((*Param)->getSourceRange().getBegin(),
diag::err_literal_operator_param)
<< SecondParamType << Context.getSizeType()
<< (*Param)->getSourceRange();
return true;
}
} else {
Diag(FnDecl->getLocation(), diag::err_literal_operator_bad_param_count);
return true;
}
// Parameters are good.
// A parameter-declaration-clause containing a default argument is not
// equivalent to any of the permitted forms.
for (auto Param : FnDecl->parameters()) {
if (Param->hasDefaultArg()) {
Diag(Param->getDefaultArgRange().getBegin(),
diag::err_literal_operator_default_argument)
<< Param->getDefaultArgRange();
break;
}
}
StringRef LiteralName
= FnDecl->getDeclName().getCXXLiteralIdentifier()->getName();
if (LiteralName[0] != '_' &&
!getSourceManager().isInSystemHeader(FnDecl->getLocation())) {
// C++11 [usrlit.suffix]p1:
// Literal suffix identifiers that do not start with an underscore
// are reserved for future standardization.
Diag(FnDecl->getLocation(), diag::warn_user_literal_reserved)
<< StringLiteralParser::isValidUDSuffix(getLangOpts(), LiteralName);
}
return false;
}
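// Non-template literal operators accepted by the checks above (illustrative,
// placeholder suffix names):
//   int operator"" _i(unsigned long long);         // integer literals
//   int operator"" _f(long double);                // floating-point literals
//   int operator"" _c(char);                       // character literals
//   int operator"" _r(const char *);               // raw literal operator
//   int operator"" _s(const char *, std::size_t);  // string literals
//   int operator"" mi(unsigned long long);         // warns: suffixes without a leading
//                                                  // '_' are reserved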
/// ActOnStartLinkageSpecification - Parsed the beginning of a C++
/// linkage specification, including the language and (if present)
/// the '{'. ExternLoc is the location of the 'extern', Lang is the
/// language string literal. LBraceLoc, if valid, provides the location of
/// the '{' brace. Otherwise, this linkage specification does not
/// have any braces.
Decl *Sema::ActOnStartLinkageSpecification(Scope *S, SourceLocation ExternLoc,
Expr *LangStr,
SourceLocation LBraceLoc) {
StringLiteral *Lit = cast<StringLiteral>(LangStr);
if (!Lit->isAscii()) {
Diag(LangStr->getExprLoc(), diag::err_language_linkage_spec_not_ascii)
<< LangStr->getSourceRange();
return nullptr;
}
StringRef Lang = Lit->getString();
LinkageSpecDecl::LanguageIDs Language;
if (Lang == "C")
Language = LinkageSpecDecl::lang_c;
else if (Lang == "C++")
Language = LinkageSpecDecl::lang_cxx;
else {
Diag(LangStr->getExprLoc(), diag::err_language_linkage_spec_unknown)
<< LangStr->getSourceRange();
return nullptr;
}
// FIXME: Add all the various semantics of linkage specifications
LinkageSpecDecl *D = LinkageSpecDecl::Create(Context, CurContext, ExternLoc,
LangStr->getExprLoc(), Language,
LBraceLoc.isValid());
CurContext->addDecl(D);
PushDeclContext(S, D);
return D;
}
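// The two recognized language strings, with and without a braced body:
//   extern "C" { void f(); }
//   extern "C++" void g();
//   extern "c" void h();   // error: unknown language linkage (the string is case-sensitive)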
/// ActOnFinishLinkageSpecification - Complete the definition of
/// the C++ linkage specification LinkageSpec. If RBraceLoc is
/// valid, it's the position of the closing '}' brace in a linkage
/// specification that uses braces.
Decl *Sema::ActOnFinishLinkageSpecification(Scope *S,
Decl *LinkageSpec,
SourceLocation RBraceLoc) {
if (RBraceLoc.isValid()) {
LinkageSpecDecl* LSDecl = cast<LinkageSpecDecl>(LinkageSpec);
LSDecl->setRBraceLoc(RBraceLoc);
}
PopDeclContext();
return LinkageSpec;
}
Decl *Sema::ActOnEmptyDeclaration(Scope *S,
const ParsedAttributesView &AttrList,
SourceLocation SemiLoc) {
Decl *ED = EmptyDecl::Create(Context, CurContext, SemiLoc);
// Attribute declarations appertain to the empty declaration, so we handle
// them here.
ProcessDeclAttributeList(S, ED, AttrList);
CurContext->addDecl(ED);
return ED;
}
/// Perform semantic analysis for the variable declaration that
/// occurs within a C++ catch clause, returning the newly-created
/// variable.
VarDecl *Sema::BuildExceptionDeclaration(Scope *S,
TypeSourceInfo *TInfo,
SourceLocation StartLoc,
SourceLocation Loc,
IdentifierInfo *Name) {
bool Invalid = false;
QualType ExDeclType = TInfo->getType();
// Arrays and functions decay.
if (ExDeclType->isArrayType())
ExDeclType = Context.getArrayDecayedType(ExDeclType);
else if (ExDeclType->isFunctionType())
ExDeclType = Context.getPointerType(ExDeclType);
// C++ 15.3p1: The exception-declaration shall not denote an incomplete type.
// The exception-declaration shall not denote a pointer or reference to an
// incomplete type, other than [cv] void*.
// N2844 forbids rvalue references.
if (!ExDeclType->isDependentType() && ExDeclType->isRValueReferenceType()) {
Diag(Loc, diag::err_catch_rvalue_ref);
Invalid = true;
}
if (ExDeclType->isVariablyModifiedType()) {
Diag(Loc, diag::err_catch_variably_modified) << ExDeclType;
Invalid = true;
}
QualType BaseType = ExDeclType;
int Mode = 0; // 0 for direct type, 1 for pointer, 2 for reference
unsigned DK = diag::err_catch_incomplete;
if (const PointerType *Ptr = BaseType->getAs<PointerType>()) {
BaseType = Ptr->getPointeeType();
Mode = 1;
DK = diag::err_catch_incomplete_ptr;
} else if (const ReferenceType *Ref = BaseType->getAs<ReferenceType>()) {
// For the purpose of error recovery, we treat rvalue refs like lvalue refs.
BaseType = Ref->getPointeeType();
Mode = 2;
DK = diag::err_catch_incomplete_ref;
}
if (!Invalid && (Mode == 0 || !BaseType->isVoidType()) &&
!BaseType->isDependentType() && RequireCompleteType(Loc, BaseType, DK))
Invalid = true;
if (!Invalid && !ExDeclType->isDependentType() &&
RequireNonAbstractType(Loc, ExDeclType,
diag::err_abstract_type_in_decl,
AbstractVariableType))
Invalid = true;
// Only the non-fragile NeXT runtime currently supports C++ catches
// of ObjC types, and no runtime supports catching ObjC types by value.
if (!Invalid && getLangOpts().ObjC) {
QualType T = ExDeclType;
if (const ReferenceType *RT = T->getAs<ReferenceType>())
T = RT->getPointeeType();
if (T->isObjCObjectType()) {
Diag(Loc, diag::err_objc_object_catch);
Invalid = true;
} else if (T->isObjCObjectPointerType()) {
// FIXME: should this be a test for macosx-fragile specifically?
if (getLangOpts().ObjCRuntime.isFragile())
Diag(Loc, diag::warn_objc_pointer_cxx_catch_fragile);
}
}
VarDecl *ExDecl = VarDecl::Create(Context, CurContext, StartLoc, Loc, Name,
ExDeclType, TInfo, SC_None);
ExDecl->setExceptionVariable(true);
// In ARC, infer 'retaining' for variables of retainable type.
if (getLangOpts().ObjCAutoRefCount && inferObjCARCLifetime(ExDecl))
Invalid = true;
if (!Invalid && !ExDeclType->isDependentType()) {
if (const RecordType *recordType = ExDeclType->getAs<RecordType>()) {
// Insulate this from anything else we might currently be parsing.
EnterExpressionEvaluationContext scope(
*this, ExpressionEvaluationContext::PotentiallyEvaluated);
// C++ [except.handle]p16:
// The object declared in an exception-declaration or, if the
// exception-declaration does not specify a name, a temporary (12.2) is
// copy-initialized (8.5) from the exception object. [...]
// The object is destroyed when the handler exits, after the destruction
// of any automatic objects initialized within the handler.
//
// We just pretend to initialize the object with itself, then make sure
// it can be destroyed later.
QualType initType = Context.getExceptionObjectType(ExDeclType);
InitializedEntity entity =
InitializedEntity::InitializeVariable(ExDecl);
InitializationKind initKind =
InitializationKind::CreateCopy(Loc, SourceLocation());
Expr *opaqueValue =
new (Context) OpaqueValueExpr(Loc, initType, VK_LValue, OK_Ordinary);
InitializationSequence sequence(*this, entity, initKind, opaqueValue);
ExprResult result = sequence.Perform(*this, entity, initKind, opaqueValue);
if (result.isInvalid())
Invalid = true;
else {
// If the constructor used was non-trivial, set this as the
// "initializer".
CXXConstructExpr *construct = result.getAs<CXXConstructExpr>();
if (!construct->getConstructor()->isTrivial()) {
Expr *init = MaybeCreateExprWithCleanups(construct);
ExDecl->setInit(init);
}
// And make sure it's destructible.
FinalizeVarWithDestructor(ExDecl, recordType);
}
}
}
if (Invalid)
ExDecl->setInvalidDecl();
return ExDecl;
}
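// Example exception-declarations as handled above (illustrative):
//   catch (int a[4]) { }   // array type decays: 'a' has type int*
//   catch (int &&r)  { }   // error: rvalue reference (N2844)
//   catch (void *p)  { }   // OK: pointer to cv void is explicitly permitted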
/// ActOnExceptionDeclarator - Parsed the exception-declarator in a C++ catch
/// handler.
Decl *Sema::ActOnExceptionDeclarator(Scope *S, Declarator &D) {
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
bool Invalid = D.isInvalidType();
// Check for unexpanded parameter packs.
if (DiagnoseUnexpandedParameterPack(D.getIdentifierLoc(), TInfo,
UPPC_ExceptionType)) {
TInfo = Context.getTrivialTypeSourceInfo(Context.IntTy,
D.getIdentifierLoc());
Invalid = true;
}
IdentifierInfo *II = D.getIdentifier();
if (NamedDecl *PrevDecl = LookupSingleName(S, II, D.getIdentifierLoc(),
LookupOrdinaryName,
ForVisibleRedeclaration)) {
// The scope should be freshly made just for us. There is just no way
// it contains any previous declaration, except for function parameters in
// a function-try-block's catch statement.
assert(!S->isDeclScope(PrevDecl));
if (isDeclInScope(PrevDecl, CurContext, S)) {
Diag(D.getIdentifierLoc(), diag::err_redefinition)
<< D.getIdentifier();
Diag(PrevDecl->getLocation(), diag::note_previous_definition);
Invalid = true;
} else if (PrevDecl->isTemplateParameter())
// Maybe we will complain about the shadowed template parameter.
DiagnoseTemplateParameterShadow(D.getIdentifierLoc(), PrevDecl);
}
if (D.getCXXScopeSpec().isSet() && !Invalid) {
Diag(D.getIdentifierLoc(), diag::err_qualified_catch_declarator)
<< D.getCXXScopeSpec().getRange();
Invalid = true;
}
VarDecl *ExDecl = BuildExceptionDeclaration(
S, TInfo, D.getBeginLoc(), D.getIdentifierLoc(), D.getIdentifier());
if (Invalid)
ExDecl->setInvalidDecl();
// Add the exception declaration into this scope.
if (II)
PushOnScopeChains(ExDecl, S);
else
CurContext->addDecl(ExDecl);
ProcessDeclAttributes(S, ExDecl, D);
return ExDecl;
}
Decl *Sema::ActOnStaticAssertDeclaration(SourceLocation StaticAssertLoc,
Expr *AssertExpr,
Expr *AssertMessageExpr,
SourceLocation RParenLoc) {
StringLiteral *AssertMessage =
AssertMessageExpr ? cast<StringLiteral>(AssertMessageExpr) : nullptr;
if (DiagnoseUnexpandedParameterPack(AssertExpr, UPPC_StaticAssertExpression))
return nullptr;
return BuildStaticAssertDeclaration(StaticAssertLoc, AssertExpr,
AssertMessage, RParenLoc, false);
}
Decl *Sema::BuildStaticAssertDeclaration(SourceLocation StaticAssertLoc,
Expr *AssertExpr,
StringLiteral *AssertMessage,
SourceLocation RParenLoc,
bool Failed) {
assert(AssertExpr != nullptr && "Expected non-null condition");
if (!AssertExpr->isTypeDependent() && !AssertExpr->isValueDependent() &&
!Failed) {
// In a static_assert-declaration, the constant-expression shall be a
// constant expression that can be contextually converted to bool.
ExprResult Converted = PerformContextuallyConvertToBool(AssertExpr);
if (Converted.isInvalid())
Failed = true;
llvm::APSInt Cond;
if (!Failed && VerifyIntegerConstantExpression(Converted.get(), &Cond,
diag::err_static_assert_expression_is_not_constant,
/*AllowFold=*/false).isInvalid())
Failed = true;
if (!Failed && !Cond) {
SmallString<256> MsgBuffer;
llvm::raw_svector_ostream Msg(MsgBuffer);
if (AssertMessage)
AssertMessage->printPretty(Msg, nullptr, getPrintingPolicy());
Expr *InnerCond = nullptr;
std::string InnerCondDescription;
std::tie(InnerCond, InnerCondDescription) =
findFailedBooleanCondition(Converted.get());
if (InnerCond && !isa<CXXBoolLiteralExpr>(InnerCond)
&& !isa<IntegerLiteral>(InnerCond)) {
Diag(StaticAssertLoc, diag::err_static_assert_requirement_failed)
<< InnerCondDescription << !AssertMessage
<< Msg.str() << InnerCond->getSourceRange();
} else {
Diag(StaticAssertLoc, diag::err_static_assert_failed)
<< !AssertMessage << Msg.str() << AssertExpr->getSourceRange();
}
Failed = true;
}
}
ExprResult FullAssertExpr = ActOnFinishFullExpr(AssertExpr, StaticAssertLoc,
/*DiscardedValue*/false,
/*IsConstexpr*/true);
if (FullAssertExpr.isInvalid())
Failed = true;
else
AssertExpr = FullAssertExpr.get();
Decl *Decl = StaticAssertDecl::Create(Context, CurContext, StaticAssertLoc,
AssertExpr, AssertMessage, RParenLoc,
Failed);
CurContext->addDecl(Decl);
return Decl;
}
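// A sketch of the evaluation behaviour implemented above:
//   static_assert(sizeof(void *) >= 4, "pointer too small");  // checked immediately
//   template <int N> struct S {
//     static_assert(N > 0, "N must be positive");  // value-dependent: deferred until
//   };                                             // S<N> is instantiated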
/// Perform semantic analysis of the given friend type declaration.
///
/// \returns A friend declaration for the given type.
FriendDecl *Sema::CheckFriendTypeDecl(SourceLocation LocStart,
SourceLocation FriendLoc,
TypeSourceInfo *TSInfo) {
assert(TSInfo && "NULL TypeSourceInfo for friend type declaration");
QualType T = TSInfo->getType();
SourceRange TypeRange = TSInfo->getTypeLoc().getLocalSourceRange();
// C++03 [class.friend]p2:
// An elaborated-type-specifier shall be used in a friend declaration
// for a class.*
//
// * The class-key of the elaborated-type-specifier is required.
if (!CodeSynthesisContexts.empty()) {
// Do not complain about the form of friend template types during any kind
// of code synthesis. For template instantiation, we will have complained
// when the template was defined.
} else {
if (!T->isElaboratedTypeSpecifier()) {
// If we evaluated the type to a record type, suggest putting
// a tag in front.
if (const RecordType *RT = T->getAs<RecordType>()) {
RecordDecl *RD = RT->getDecl();
SmallString<16> InsertionText(" ");
InsertionText += RD->getKindName();
Diag(TypeRange.getBegin(),
getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_unelaborated_friend_type :
diag::ext_unelaborated_friend_type)
<< (unsigned) RD->getTagKind()
<< T
<< FixItHint::CreateInsertion(getLocForEndOfToken(FriendLoc),
InsertionText);
} else {
Diag(FriendLoc,
getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_nonclass_type_friend :
diag::ext_nonclass_type_friend)
<< T
<< TypeRange;
}
} else if (T->getAs<EnumType>()) {
Diag(FriendLoc,
getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_enum_friend :
diag::ext_enum_friend)
<< T
<< TypeRange;
}
// C++11 [class.friend]p3:
// A friend declaration that does not declare a function shall have one
// of the following forms:
// friend elaborated-type-specifier ;
// friend simple-type-specifier ;
// friend typename-specifier ;
if (getLangOpts().CPlusPlus11 && LocStart != FriendLoc)
Diag(FriendLoc, diag::err_friend_not_first_in_declaration) << T;
}
// If the type specifier in a friend declaration designates a (possibly
// cv-qualified) class type, that class is declared as a friend; otherwise,
// the friend declaration is ignored.
return FriendDecl::Create(Context, CurContext,
TSInfo->getTypeLoc().getBeginLoc(), TSInfo,
FriendLoc);
}
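// Friend type declarations exercising the diagnostics above (illustrative):
//   class X; enum E : int;
//   class Y {
//     friend class X;  // OK: elaborated-type-specifier
//     friend X;        // C++11 form; an extension (with a class-key fix-it) in C++98 mode
//     friend enum E;   // diagnosed as an extension in C++98 mode
//   };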
/// Handle a friend tag declaration where the scope specifier was
/// templated.
Decl *Sema::ActOnTemplatedFriendTag(Scope *S, SourceLocation FriendLoc,
unsigned TagSpec, SourceLocation TagLoc,
CXXScopeSpec &SS, IdentifierInfo *Name,
SourceLocation NameLoc,
const ParsedAttributesView &Attr,
MultiTemplateParamsArg TempParamLists) {
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
bool IsMemberSpecialization = false;
bool Invalid = false;
if (TemplateParameterList *TemplateParams =
MatchTemplateParametersToScopeSpecifier(
TagLoc, NameLoc, SS, nullptr, TempParamLists, /*friend*/ true,
IsMemberSpecialization, Invalid)) {
if (TemplateParams->size() > 0) {
// This is a declaration of a class template.
if (Invalid)
return nullptr;
return CheckClassTemplate(S, TagSpec, TUK_Friend, TagLoc, SS, Name,
NameLoc, Attr, TemplateParams, AS_public,
/*ModulePrivateLoc=*/SourceLocation(),
FriendLoc, TempParamLists.size() - 1,
TempParamLists.data()).get();
} else {
// The "template<>" header is extraneous.
Diag(TemplateParams->getTemplateLoc(), diag::err_template_tag_noparams)
<< TypeWithKeyword::getTagTypeKindName(Kind) << Name;
IsMemberSpecialization = true;
}
}
if (Invalid) return nullptr;
bool isAllExplicitSpecializations = true;
for (unsigned I = TempParamLists.size(); I-- > 0; ) {
if (TempParamLists[I]->size()) {
isAllExplicitSpecializations = false;
break;
}
}
// FIXME: don't ignore attributes.
// If it's explicit specializations all the way down, just forget
// about the template header and build an appropriate non-templated
// friend. TODO: for source fidelity, remember the headers.
if (isAllExplicitSpecializations) {
if (SS.isEmpty()) {
bool Owned = false;
bool IsDependent = false;
return ActOnTag(S, TagSpec, TUK_Friend, TagLoc, SS, Name, NameLoc,
Attr, AS_public,
/*ModulePrivateLoc=*/SourceLocation(),
MultiTemplateParamsArg(), Owned, IsDependent,
/*ScopedEnumKWLoc=*/SourceLocation(),
/*ScopedEnumUsesClassTag=*/false,
/*UnderlyingType=*/TypeResult(),
/*IsTypeSpecifier=*/false,
/*IsTemplateParamOrArg=*/false);
}
NestedNameSpecifierLoc QualifierLoc = SS.getWithLocInContext(Context);
ElaboratedTypeKeyword Keyword
= TypeWithKeyword::getKeywordForTagTypeKind(Kind);
QualType T = CheckTypenameType(Keyword, TagLoc, QualifierLoc,
*Name, NameLoc);
if (T.isNull())
return nullptr;
TypeSourceInfo *TSI = Context.CreateTypeSourceInfo(T);
if (isa<DependentNameType>(T)) {
DependentNameTypeLoc TL =
TSI->getTypeLoc().castAs<DependentNameTypeLoc>();
TL.setElaboratedKeywordLoc(TagLoc);
TL.setQualifierLoc(QualifierLoc);
TL.setNameLoc(NameLoc);
} else {
ElaboratedTypeLoc TL = TSI->getTypeLoc().castAs<ElaboratedTypeLoc>();
TL.setElaboratedKeywordLoc(TagLoc);
TL.setQualifierLoc(QualifierLoc);
TL.getNamedTypeLoc().castAs<TypeSpecTypeLoc>().setNameLoc(NameLoc);
}
FriendDecl *Friend = FriendDecl::Create(Context, CurContext, NameLoc,
TSI, FriendLoc, TempParamLists);
Friend->setAccess(AS_public);
CurContext->addDecl(Friend);
return Friend;
}
assert(SS.isNotEmpty() && "valid templated tag with no SS and no direct?");
// Handle the case of a templated-scope friend class. e.g.
// template <class T> class A<T>::B;
// FIXME: we don't support these right now.
Diag(NameLoc, diag::warn_template_qualified_friend_unsupported)
<< SS.getScopeRep() << SS.getRange() << cast<CXXRecordDecl>(CurContext);
ElaboratedTypeKeyword ETK = TypeWithKeyword::getKeywordForTagTypeKind(Kind);
QualType T = Context.getDependentNameType(ETK, SS.getScopeRep(), Name);
TypeSourceInfo *TSI = Context.CreateTypeSourceInfo(T);
DependentNameTypeLoc TL = TSI->getTypeLoc().castAs<DependentNameTypeLoc>();
TL.setElaboratedKeywordLoc(TagLoc);
TL.setQualifierLoc(SS.getWithLocInContext(Context));
TL.setNameLoc(NameLoc);
FriendDecl *Friend = FriendDecl::Create(Context, CurContext, NameLoc,
TSI, FriendLoc, TempParamLists);
Friend->setAccess(AS_public);
Friend->setUnsupportedFriend(true);
CurContext->addDecl(Friend);
return Friend;
}
/// Handle a friend type declaration. This works in tandem with
/// ActOnTag.
///
/// Notes on friend class templates:
///
/// We generally treat friend class declarations as if they were
/// declaring a class. So, for example, the elaborated type specifier
/// in a friend declaration is required to obey the restrictions of a
/// class-head (i.e. no typedefs in the scope chain), template
/// parameters are required to match up with simple template-ids, &c.
/// However, unlike when declaring a template specialization, it's
/// okay to refer to a template specialization without an empty
/// template parameter declaration, e.g.
/// friend class A<T>::B<unsigned>;
/// We permit this as a special case; if there are any template
/// parameters present at all, require proper matching, i.e.
/// template <> template \<class T> friend class A<int>::B;
Decl *Sema::ActOnFriendTypeDecl(Scope *S, const DeclSpec &DS,
MultiTemplateParamsArg TempParams) {
SourceLocation Loc = DS.getBeginLoc();
assert(DS.isFriendSpecified());
assert(DS.getStorageClassSpec() == DeclSpec::SCS_unspecified);
// C++ [class.friend]p3:
// A friend declaration that does not declare a function shall have one of
// the following forms:
// friend elaborated-type-specifier ;
// friend simple-type-specifier ;
// friend typename-specifier ;
//
// Any declaration with a type qualifier does not have that form. (It's
// legal to specify a qualified type as a friend, you just can't write the
// keywords.)
if (DS.getTypeQualifiers()) {
if (DS.getTypeQualifiers() & DeclSpec::TQ_const)
Diag(DS.getConstSpecLoc(), diag::err_friend_decl_spec) << "const";
if (DS.getTypeQualifiers() & DeclSpec::TQ_volatile)
Diag(DS.getVolatileSpecLoc(), diag::err_friend_decl_spec) << "volatile";
if (DS.getTypeQualifiers() & DeclSpec::TQ_restrict)
Diag(DS.getRestrictSpecLoc(), diag::err_friend_decl_spec) << "restrict";
if (DS.getTypeQualifiers() & DeclSpec::TQ_atomic)
Diag(DS.getAtomicSpecLoc(), diag::err_friend_decl_spec) << "_Atomic";
if (DS.getTypeQualifiers() & DeclSpec::TQ_unaligned)
Diag(DS.getUnalignedSpecLoc(), diag::err_friend_decl_spec) << "__unaligned";
}
// Try to convert the decl specifier to a type. This works for
// friend templates because ActOnTag never produces a ClassTemplateDecl
// for a TUK_Friend.
Declarator TheDeclarator(DS, DeclaratorContext::MemberContext);
TypeSourceInfo *TSI = GetTypeForDeclarator(TheDeclarator, S);
QualType T = TSI->getType();
if (TheDeclarator.isInvalidType())
return nullptr;
if (DiagnoseUnexpandedParameterPack(Loc, TSI, UPPC_FriendDeclaration))
return nullptr;
// This is definitely an error in C++98. It's probably meant to
// be forbidden in C++0x, too, but the specification is just
// poorly written.
//
// The problem is with declarations like the following:
// template <T> friend A<T>::foo;
// where deciding whether a class C is a friend or not now hinges
// on whether there exists an instantiation of A that causes
// 'foo' to equal C. There are restrictions on class-heads
// (which we declare (by fiat) elaborated friend declarations to
// be) that makes this tractable.
//
// FIXME: handle "template <> friend class A<T>;", which
// is possibly well-formed? Who even knows?
if (TempParams.size() && !T->isElaboratedTypeSpecifier()) {
Diag(Loc, diag::err_tagless_friend_type_template)
<< DS.getSourceRange();
return nullptr;
}
// C++98 [class.friend]p1: A friend of a class is a function
// or class that is not a member of the class . . .
// This is fixed in DR77, which just barely didn't make the C++03
// deadline. It's also a very silly restriction that seriously
// affects inner classes and which nobody else seems to implement;
// thus we never diagnose it, not even in -pedantic.
//
// But note that we could warn about it: it's always useless to
// friend one of your own members (it's not, however, worthless to
// friend a member of an arbitrary specialization of your template).
Decl *D;
if (!TempParams.empty())
D = FriendTemplateDecl::Create(Context, CurContext, Loc,
TempParams,
TSI,
DS.getFriendSpecLoc());
else
D = CheckFriendTypeDecl(Loc, DS.getFriendSpecLoc(), TSI);
if (!D)
return nullptr;
D->setAccess(AS_public);
CurContext->addDecl(D);
return D;
}
NamedDecl *Sema::ActOnFriendFunctionDecl(Scope *S, Declarator &D,
MultiTemplateParamsArg TemplateParams) {
const DeclSpec &DS = D.getDeclSpec();
assert(DS.isFriendSpecified());
assert(DS.getStorageClassSpec() == DeclSpec::SCS_unspecified);
SourceLocation Loc = D.getIdentifierLoc();
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
// C++ [class.friend]p1
// A friend of a class is a function or class....
// Note that this sees through typedefs, which is intended.
// It *doesn't* see through dependent types, which is correct
// according to [temp.arg.type]p3:
// If a declaration acquires a function type through a
// type dependent on a template-parameter and this causes
// a declaration that does not use the syntactic form of a
// function declarator to have a function type, the program
// is ill-formed.
if (!TInfo->getType()->isFunctionType()) {
Diag(Loc, diag::err_unexpected_friend);
// It might be worthwhile to try to recover by creating an
// appropriate declaration.
return nullptr;
}
// C++ [namespace.memdef]p3
// - If a friend declaration in a non-local class first declares a
// class or function, the friend class or function is a member
// of the innermost enclosing namespace.
// - The name of the friend is not found by simple name lookup
// until a matching declaration is provided in that namespace
// scope (either before or after the class declaration granting
// friendship).
// - If a friend function is called, its name may be found by the
// name lookup that considers functions from namespaces and
// classes associated with the types of the function arguments.
// - When looking for a prior declaration of a class or a function
// declared as a friend, scopes outside the innermost enclosing
// namespace scope are not considered.
CXXScopeSpec &SS = D.getCXXScopeSpec();
DeclarationNameInfo NameInfo = GetNameForDeclarator(D);
assert(NameInfo.getName());
// Check for unexpanded parameter packs.
if (DiagnoseUnexpandedParameterPack(Loc, TInfo, UPPC_FriendDeclaration) ||
DiagnoseUnexpandedParameterPack(NameInfo, UPPC_FriendDeclaration) ||
DiagnoseUnexpandedParameterPack(SS, UPPC_FriendDeclaration))
return nullptr;
// The context we found the declaration in, or in which we should
// create the declaration.
DeclContext *DC;
Scope *DCScope = S;
LookupResult Previous(*this, NameInfo, LookupOrdinaryName,
ForExternalRedeclaration);
// There are five cases here.
// - There's no scope specifier and we're in a local class. Only look
// for functions declared in the immediately-enclosing block scope.
// We recover from invalid scope qualifiers as if they just weren't there.
FunctionDecl *FunctionContainingLocalClass = nullptr;
if ((SS.isInvalid() || !SS.isSet()) &&
(FunctionContainingLocalClass =
cast<CXXRecordDecl>(CurContext)->isLocalClass())) {
// C++11 [class.friend]p11:
// If a friend declaration appears in a local class and the name
// specified is an unqualified name, a prior declaration is
// looked up without considering scopes that are outside the
// innermost enclosing non-class scope. For a friend function
// declaration, if there is no prior declaration, the program is
// ill-formed.
// Find the innermost enclosing non-class scope. This is the block
// scope containing the local class definition (or for a nested class,
// the outer local class).
DCScope = S->getFnParent();
// Look up the function name in the scope.
Previous.clear(LookupLocalFriendName);
LookupName(Previous, S, /*AllowBuiltinCreation*/false);
if (!Previous.empty()) {
// All possible previous declarations must have the same context:
// either they were declared at block scope or they are members of
// one of the enclosing local classes.
DC = Previous.getRepresentativeDecl()->getDeclContext();
} else {
// This is ill-formed, but provide the context that we would have
// declared the function in, if we were permitted to, for error recovery.
DC = FunctionContainingLocalClass;
}
adjustContextForLocalExternDecl(DC);
// C++ [class.friend]p6:
// A function can be defined in a friend declaration of a class if and
// only if the class is a non-local class (9.8), the function name is
// unqualified, and the function has namespace scope.
if (D.isFunctionDefinition()) {
Diag(NameInfo.getBeginLoc(), diag::err_friend_def_in_local_class);
}
// - There's no scope specifier, in which case we just go to the
// appropriate scope and look for a function or function template
// there as appropriate.
} else if (SS.isInvalid() || !SS.isSet()) {
// C++11 [namespace.memdef]p3:
// If the name in a friend declaration is neither qualified nor
// a template-id and the declaration is a function or an
// elaborated-type-specifier, the lookup to determine whether
// the entity has been previously declared shall not consider
// any scopes outside the innermost enclosing namespace.
bool isTemplateId =
D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId;
// Find the appropriate context according to the above.
DC = CurContext;
// Skip class contexts. If someone can cite chapter and verse
// for this behavior, that would be nice --- it's what GCC and
// EDG do, and it seems like a reasonable intent, but the spec
// really only says that checks for unqualified existing
// declarations should stop at the nearest enclosing namespace,
// not that they should only consider the nearest enclosing
// namespace.
while (DC->isRecord())
DC = DC->getParent();
DeclContext *LookupDC = DC;
while (LookupDC->isTransparentContext())
LookupDC = LookupDC->getParent();
while (true) {
LookupQualifiedName(Previous, LookupDC);
if (!Previous.empty()) {
DC = LookupDC;
break;
}
if (isTemplateId) {
if (isa<TranslationUnitDecl>(LookupDC)) break;
} else {
if (LookupDC->isFileContext()) break;
}
LookupDC = LookupDC->getParent();
}
DCScope = getScopeForDeclContext(S, DC);
// - There's a non-dependent scope specifier, in which case we
// compute it and do a previous lookup there for a function
// or function template.
} else if (!SS.getScopeRep()->isDependent()) {
DC = computeDeclContext(SS);
if (!DC) return nullptr;
if (RequireCompleteDeclContext(SS, DC)) return nullptr;
LookupQualifiedName(Previous, DC);
// C++ [class.friend]p1: A friend of a class is a function or
// class that is not a member of the class . . .
if (DC->Equals(CurContext))
Diag(DS.getFriendSpecLoc(),
getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_friend_is_member :
diag::err_friend_is_member);
if (D.isFunctionDefinition()) {
// C++ [class.friend]p6:
// A function can be defined in a friend declaration of a class if and
// only if the class is a non-local class (9.8), the function name is
// unqualified, and the function has namespace scope.
//
// FIXME: We should only do this if the scope specifier names the
// innermost enclosing namespace; otherwise the fixit changes the
// meaning of the code.
SemaDiagnosticBuilder DB
= Diag(SS.getRange().getBegin(), diag::err_qualified_friend_def);
DB << SS.getScopeRep();
if (DC->isFileContext())
DB << FixItHint::CreateRemoval(SS.getRange());
SS.clear();
}
// - There's a scope specifier that does not match any template
// parameter lists, in which case we use some arbitrary context,
// create a method or method template, and wait for instantiation.
// - There's a scope specifier that does match some template
// parameter lists, which we don't handle right now.
} else {
if (D.isFunctionDefinition()) {
// C++ [class.friend]p6:
// A function can be defined in a friend declaration of a class if and
// only if the class is a non-local class (9.8), the function name is
// unqualified, and the function has namespace scope.
Diag(SS.getRange().getBegin(), diag::err_qualified_friend_def)
<< SS.getScopeRep();
}
DC = CurContext;
assert(isa<CXXRecordDecl>(DC) && "friend declaration not in class?");
}
if (!DC->isRecord()) {
int DiagArg = -1;
switch (D.getName().getKind()) {
case UnqualifiedIdKind::IK_ConstructorTemplateId:
case UnqualifiedIdKind::IK_ConstructorName:
DiagArg = 0;
break;
case UnqualifiedIdKind::IK_DestructorName:
DiagArg = 1;
break;
case UnqualifiedIdKind::IK_ConversionFunctionId:
DiagArg = 2;
break;
case UnqualifiedIdKind::IK_DeductionGuideName:
DiagArg = 3;
break;
case UnqualifiedIdKind::IK_Identifier:
case UnqualifiedIdKind::IK_ImplicitSelfParam:
case UnqualifiedIdKind::IK_LiteralOperatorId:
case UnqualifiedIdKind::IK_OperatorFunctionId:
case UnqualifiedIdKind::IK_TemplateId:
break;
}
// This implies that it has to be an operator or function.
if (DiagArg >= 0) {
Diag(Loc, diag::err_introducing_special_friend) << DiagArg;
return nullptr;
}
}
// FIXME: This is an egregious hack to cope with cases where the scope stack
// does not contain the declaration context, i.e., in an out-of-line
// definition of a class.
Scope FakeDCScope(S, Scope::DeclScope, Diags);
if (!DCScope) {
FakeDCScope.setEntity(DC);
DCScope = &FakeDCScope;
}
bool AddToScope = true;
NamedDecl *ND = ActOnFunctionDeclarator(DCScope, D, DC, TInfo, Previous,
TemplateParams, AddToScope);
if (!ND) return nullptr;
assert(ND->getLexicalDeclContext() == CurContext);
// If we performed typo correction, we might have added a scope specifier
// and changed the decl context.
DC = ND->getDeclContext();
// Add the function declaration to the appropriate lookup tables,
// adjusting the redeclarations list as necessary. We don't
// want to do this yet if the friending class is dependent.
//
// Also update the scope-based lookup if the target context's
// lookup context is in lexical scope.
if (!CurContext->isDependentContext()) {
DC = DC->getRedeclContext();
DC->makeDeclVisibleInContext(ND);
if (Scope *EnclosingScope = getScopeForDeclContext(S, DC))
PushOnScopeChains(ND, EnclosingScope, /*AddToContext=*/ false);
}
FriendDecl *FrD = FriendDecl::Create(Context, CurContext,
D.getIdentifierLoc(), ND,
DS.getFriendSpecLoc());
FrD->setAccess(AS_public);
CurContext->addDecl(FrD);
if (ND->isInvalidDecl()) {
FrD->setInvalidDecl();
} else {
if (DC->isRecord()) CheckFriendAccess(ND);
FunctionDecl *FD;
if (FunctionTemplateDecl *FTD = dyn_cast<FunctionTemplateDecl>(ND))
FD = FTD->getTemplatedDecl();
else
FD = cast<FunctionDecl>(ND);
// C++11 [dcl.fct.default]p4: If a friend declaration specifies a
// default argument expression, that declaration shall be a definition
// and shall be the only declaration of the function or function
// template in the translation unit.
if (functionDeclHasDefaultArgument(FD)) {
// We can't look at FD->getPreviousDecl() because it may not have been set
// if we're in a dependent context. If the function is known to be a
// redeclaration, we will have narrowed Previous down to the right decl.
if (D.isRedeclaration()) {
Diag(FD->getLocation(), diag::err_friend_decl_with_def_arg_redeclared);
Diag(Previous.getRepresentativeDecl()->getLocation(),
diag::note_previous_declaration);
} else if (!D.isFunctionDefinition())
Diag(FD->getLocation(), diag::err_friend_decl_with_def_arg_must_be_def);
}
// Mark templated-scope function declarations as unsupported.
if (FD->getNumTemplateParameterLists() && SS.isValid()) {
Diag(FD->getLocation(), diag::warn_template_qualified_friend_unsupported)
<< SS.getScopeRep() << SS.getRange()
<< cast<CXXRecordDecl>(CurContext);
FrD->setUnsupportedFriend(true);
}
}
return ND;
}
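// Example of the name-lookup rules quoted above ([namespace.memdef]p3),
// with placeholder names:
//   namespace N {
//     class C { friend void f(C); };  // f becomes a member of N, but is not found by
//   }                                 // ordinary lookup until N declares it
//   void g(N::C c) { f(c); }          // OK: f is found by argument-dependent lookup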
void Sema::SetDeclDeleted(Decl *Dcl, SourceLocation DelLoc) {
AdjustDeclIfTemplate(Dcl);
FunctionDecl *Fn = dyn_cast_or_null<FunctionDecl>(Dcl);
if (!Fn) {
Diag(DelLoc, diag::err_deleted_non_function);
return;
}
// Deleted function does not have a body.
Fn->setWillHaveBody(false);
if (const FunctionDecl *Prev = Fn->getPreviousDecl()) {
// Don't consider the implicit declaration we generate for explicit
// specializations. FIXME: Do not generate these implicit declarations.
if ((Prev->getTemplateSpecializationKind() != TSK_ExplicitSpecialization ||
Prev->getPreviousDecl()) &&
!Prev->isDefined()) {
Diag(DelLoc, diag::err_deleted_decl_not_first);
Diag(Prev->getLocation().isInvalid() ? DelLoc : Prev->getLocation(),
Prev->isImplicit() ? diag::note_previous_implicit_declaration
: diag::note_previous_declaration);
}
// If the declaration wasn't the first, we delete the function anyway for
// recovery.
Fn = Fn->getCanonicalDecl();
}
// dllimport/dllexport cannot be deleted.
if (const InheritableAttr *DLLAttr = getDLLAttr(Fn)) {
Diag(Fn->getLocation(), diag::err_attribute_dll_deleted) << DLLAttr;
Fn->setInvalidDecl();
}
if (Fn->isDeleted())
return;
// See if we're deleting a function which is already known to override a
// non-deleted virtual function.
if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(Fn)) {
bool IssuedDiagnostic = false;
for (const CXXMethodDecl *O : MD->overridden_methods()) {
if (!O->isDeleted()) {
if (!IssuedDiagnostic) {
Diag(DelLoc, diag::err_deleted_override) << MD->getDeclName();
IssuedDiagnostic = true;
}
Diag(O->getLocation(), diag::note_overridden_virtual_function);
}
}
// If this function was implicitly deleted because it was defaulted,
// explain why it was deleted.
if (IssuedDiagnostic && MD->isDefaulted())
ShouldDeleteSpecialMember(MD, getSpecialMember(MD), nullptr,
/*Diagnose*/true);
}
// C++11 [basic.start.main]p3:
// A program that defines main as deleted [...] is ill-formed.
if (Fn->isMain())
Diag(DelLoc, diag::err_deleted_main);
// C++11 [dcl.fct.def.delete]p4:
// A deleted function is implicitly inline.
Fn->setImplicitlyInline();
Fn->setDeletedAsWritten();
}
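// Declarations hitting the checks above (illustrative):
//   int f();
//   int f() = delete;              // error: deleted definition must be the first declaration
//   int main() = delete;           // error: main cannot be deleted
//   struct B { virtual void g(); };
//   struct D : B {
//     void g() override = delete;  // error: overrides a non-deleted virtual function
//   };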
void Sema::SetDeclDefaulted(Decl *Dcl, SourceLocation DefaultLoc) {
CXXMethodDecl *MD = dyn_cast_or_null<CXXMethodDecl>(Dcl);
if (MD) {
if (MD->getParent()->isDependentType()) {
MD->setDefaulted();
MD->setExplicitlyDefaulted();
return;
}
CXXSpecialMember Member = getSpecialMember(MD);
if (Member == CXXInvalid) {
if (!MD->isInvalidDecl())
Diag(DefaultLoc, diag::err_default_special_members);
return;
}
MD->setDefaulted();
MD->setExplicitlyDefaulted();
// Unset that we will have a body for this function. We might not,
// if it turns out to be trivial, and we don't need this marking now
// that we've marked it as defaulted.
MD->setWillHaveBody(false);
// If this definition appears within the record, do the checking when
// the record is complete.
const FunctionDecl *Primary = MD;
if (const FunctionDecl *Pattern = MD->getTemplateInstantiationPattern())
// Ask the template instantiation pattern that actually had the
// '= default' on it.
Primary = Pattern;
// If the method was defaulted on its first declaration, we will have
// already performed the checking in CheckCompletedCXXClass. Such a
// declaration doesn't trigger an implicit definition.
if (Primary->getCanonicalDecl()->isDefaulted())
return;
CheckExplicitlyDefaultedSpecialMember(MD);
if (!MD->isInvalidDecl())
DefineImplicitSpecialMember(*this, MD, DefaultLoc);
} else {
Diag(DefaultLoc, diag::err_default_special_members);
}
}
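// Only special member functions may be explicitly defaulted here, e.g.:
//   struct S {
//     S() = default;       // OK: default constructor, checked once S is complete
//     void f() = default;  // error: not a special member function
//   };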
static void SearchForReturnInStmt(Sema &Self, Stmt *S) {
for (Stmt *SubStmt : S->children()) {
if (!SubStmt)
continue;
if (isa<ReturnStmt>(SubStmt))
Self.Diag(SubStmt->getBeginLoc(),
diag::err_return_in_constructor_handler);
if (!isa<Expr>(SubStmt))
SearchForReturnInStmt(Self, SubStmt);
}
}
void Sema::DiagnoseReturnInConstructorExceptionHandler(CXXTryStmt *TryBlock) {
for (unsigned I = 0, E = TryBlock->getNumHandlers(); I != E; ++I) {
CXXCatchStmt *Handler = TryBlock->getHandler(I);
SearchForReturnInStmt(*this, Handler);
}
}
bool Sema::CheckOverridingFunctionAttributes(const CXXMethodDecl *New,
const CXXMethodDecl *Old) {
const auto *NewFT = New->getType()->getAs<FunctionProtoType>();
const auto *OldFT = Old->getType()->getAs<FunctionProtoType>();
if (OldFT->hasExtParameterInfos()) {
for (unsigned I = 0, E = OldFT->getNumParams(); I != E; ++I)
// A parameter of the overriding method should be annotated with noescape
// if the corresponding parameter of the overridden method is annotated.
if (OldFT->getExtParameterInfo(I).isNoEscape() &&
!NewFT->getExtParameterInfo(I).isNoEscape()) {
Diag(New->getParamDecl(I)->getLocation(),
diag::warn_overriding_method_missing_noescape);
Diag(Old->getParamDecl(I)->getLocation(),
diag::note_overridden_marked_noescape);
}
}
// Virtual overrides must have the same code_seg.
const auto *OldCSA = Old->getAttr<CodeSegAttr>();
const auto *NewCSA = New->getAttr<CodeSegAttr>();
if ((NewCSA || OldCSA) &&
(!OldCSA || !NewCSA || NewCSA->getName() != OldCSA->getName())) {
Diag(New->getLocation(), diag::err_mismatched_code_seg_override);
Diag(Old->getLocation(), diag::note_previous_declaration);
return true;
}
CallingConv NewCC = NewFT->getCallConv(), OldCC = OldFT->getCallConv();
// If the calling conventions match, everything is fine
if (NewCC == OldCC)
return false;
// If the calling conventions mismatch because the new function is static,
// suppress the calling convention mismatch error; the error about static
// function override (err_static_overrides_virtual from
// Sema::CheckFunctionDeclaration) is more clear.
if (New->getStorageClass() == SC_Static)
return false;
Diag(New->getLocation(),
diag::err_conflicting_overriding_cc_attributes)
<< New->getDeclName() << New->getType() << Old->getType();
Diag(Old->getLocation(), diag::note_overridden_virtual_function);
return true;
}
bool Sema::CheckOverridingFunctionReturnType(const CXXMethodDecl *New,
const CXXMethodDecl *Old) {
QualType NewTy = New->getType()->getAs<FunctionType>()->getReturnType();
QualType OldTy = Old->getType()->getAs<FunctionType>()->getReturnType();
if (Context.hasSameType(NewTy, OldTy) ||
NewTy->isDependentType() || OldTy->isDependentType())
return false;
// Check if the return types are covariant
QualType NewClassTy, OldClassTy;
// Both types must be pointers or references to classes.
if (const PointerType *NewPT = NewTy->getAs<PointerType>()) {
if (const PointerType *OldPT = OldTy->getAs<PointerType>()) {
NewClassTy = NewPT->getPointeeType();
OldClassTy = OldPT->getPointeeType();
}
} else if (const ReferenceType *NewRT = NewTy->getAs<ReferenceType>()) {
if (const ReferenceType *OldRT = OldTy->getAs<ReferenceType>()) {
if (NewRT->getTypeClass() == OldRT->getTypeClass()) {
NewClassTy = NewRT->getPointeeType();
OldClassTy = OldRT->getPointeeType();
}
}
}
// The return types are not both pointers or both references to a class type.
if (NewClassTy.isNull()) {
Diag(New->getLocation(),
diag::err_different_return_type_for_overriding_virtual_function)
<< New->getDeclName() << NewTy << OldTy
<< New->getReturnTypeSourceRange();
Diag(Old->getLocation(), diag::note_overridden_virtual_function)
<< Old->getReturnTypeSourceRange();
return true;
}
if (!Context.hasSameUnqualifiedType(NewClassTy, OldClassTy)) {
// C++14 [class.virtual]p8:
// If the class type in the covariant return type of D::f differs from
// that of B::f, the class type in the return type of D::f shall be
// complete at the point of declaration of D::f or shall be the class
// type D.
if (const RecordType *RT = NewClassTy->getAs<RecordType>()) {
if (!RT->isBeingDefined() &&
RequireCompleteType(New->getLocation(), NewClassTy,
diag::err_covariant_return_incomplete,
New->getDeclName()))
return true;
}
// Check if the new class derives from the old class.
if (!IsDerivedFrom(New->getLocation(), NewClassTy, OldClassTy)) {
Diag(New->getLocation(), diag::err_covariant_return_not_derived)
<< New->getDeclName() << NewTy << OldTy
<< New->getReturnTypeSourceRange();
Diag(Old->getLocation(), diag::note_overridden_virtual_function)
<< Old->getReturnTypeSourceRange();
return true;
}
// Check that the conversion from derived to base is valid.
if (CheckDerivedToBaseConversion(
NewClassTy, OldClassTy,
diag::err_covariant_return_inaccessible_base,
diag::err_covariant_return_ambiguous_derived_to_base_conv,
New->getLocation(), New->getReturnTypeSourceRange(),
New->getDeclName(), nullptr)) {
// FIXME: this note won't trigger for delayed access control
// diagnostics, and it's impossible to get an undelayed error
// here from access control during the original parse because
// the ParsingDeclSpec/ParsingDeclarator are still in scope.
Diag(Old->getLocation(), diag::note_overridden_virtual_function)
<< Old->getReturnTypeSourceRange();
return true;
}
}
// The qualifiers of the return types must be the same.
if (NewTy.getLocalCVRQualifiers() != OldTy.getLocalCVRQualifiers()) {
Diag(New->getLocation(),
diag::err_covariant_return_type_different_qualifications)
<< New->getDeclName() << NewTy << OldTy
<< New->getReturnTypeSourceRange();
Diag(Old->getLocation(), diag::note_overridden_virtual_function)
<< Old->getReturnTypeSourceRange();
return true;
}
// The new class type must have the same or fewer qualifiers than the old type.
if (NewClassTy.isMoreQualifiedThan(OldClassTy)) {
Diag(New->getLocation(),
diag::err_covariant_return_type_class_type_more_qualified)
<< New->getDeclName() << NewTy << OldTy
<< New->getReturnTypeSourceRange();
Diag(Old->getLocation(), diag::note_overridden_virtual_function)
<< Old->getReturnTypeSourceRange();
return true;
}
return false;
}
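// Covariant-return examples for the checks above (placeholder names):
//   struct B { virtual B *clone(); };
//   struct D : B { D *clone() override; };  // OK: D* is covariant with B*
//   struct E { virtual E *make(); };
//   struct F : E { int make() override; };  // error: return type differs and is not a
//                                           // pointer or reference to a class type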
/// Mark the given method pure.
///
/// \param Method the method to be marked pure.
///
/// \param InitRange the source range that covers the "0" initializer.
bool Sema::CheckPureMethod(CXXMethodDecl *Method, SourceRange InitRange) {
SourceLocation EndLoc = InitRange.getEnd();
if (EndLoc.isValid())
Method->setRangeEnd(EndLoc);
if (Method->isVirtual() || Method->getParent()->isDependentContext()) {
Method->setPure();
return false;
}
if (!Method->isInvalidDecl())
Diag(Method->getLocation(), diag::err_non_virtual_pure)
<< Method->getDeclName() << InitRange;
return true;
}
void Sema::ActOnPureSpecifier(Decl *D, SourceLocation ZeroLoc) {
if (D->getFriendObjectKind())
Diag(D->getLocation(), diag::err_pure_friend);
else if (auto *M = dyn_cast<CXXMethodDecl>(D))
CheckPureMethod(M, ZeroLoc);
else
Diag(D->getLocation(), diag::err_illegal_initializer);
}
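// The pure-specifier is only meaningful on virtual member functions:
//   struct A { virtual void f() = 0; };  // OK: pure virtual
//   struct B { void g() = 0; };          // error: 'g' is not virtual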
/// Determine whether the given declaration is a global variable or
/// static data member.
static bool isNonlocalVariable(const Decl *D) {
if (const VarDecl *Var = dyn_cast_or_null<VarDecl>(D))
return Var->hasGlobalStorage();
return false;
}
/// Invoked when we are about to parse an initializer for the declaration
/// 'Dcl'.
///
/// After this method is called, according to [C++ 3.4.1p13], if 'Dcl' is a
/// static data member of class X, names should be looked up in the scope of
/// class X. If the declaration had a scope specifier, a scope will have
/// been created and passed in for this purpose. Otherwise, S will be null.
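///
/// Illustrative example (not part of the original source):
///   struct X { static const int a = 1; static int b; };
///   int X::b = a;  // 'a' is found by lookup in the scope of X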
void Sema::ActOnCXXEnterDeclInitializer(Scope *S, Decl *D) {
// If there is no declaration, there was an error parsing it.
if (!D || D->isInvalidDecl())
return;
// We will always have a nested name specifier here, but this declaration
// might not be out of line if the specifier names the current namespace:
// extern int n;
// int ::n = 0;
if (S && D->isOutOfLine())
EnterDeclaratorContext(S, D->getDeclContext());
// If we are parsing the initializer for a static data member, push a
// new expression evaluation context that is associated with this static
// data member.
if (isNonlocalVariable(D))
PushExpressionEvaluationContext(
ExpressionEvaluationContext::PotentiallyEvaluated, D);
}
/// Invoked after we are finished parsing an initializer for the declaration D.
void Sema::ActOnCXXExitDeclInitializer(Scope *S, Decl *D) {
// If there is no declaration, there was an error parsing it.
if (!D || D->isInvalidDecl())
return;
if (isNonlocalVariable(D))
PopExpressionEvaluationContext();
if (S && D->isOutOfLine())
ExitDeclaratorContext(S);
}
/// ActOnCXXConditionDeclaration - Parsed a condition declaration of a
/// C++ if/switch/while/for statement.
/// e.g.: "if (int x = f()) {...}"
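///
/// Further illustrative cases (not part of the original source):
///   if (int f()) {}   // error: the declarator may not specify a function
///   if (int a[4]) {}  // error: ...nor an array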
DeclResult Sema::ActOnCXXConditionDeclaration(Scope *S, Declarator &D) {
// C++ 6.4p2:
// The declarator shall not specify a function or an array.
// The type-specifier-seq shall not contain typedef and shall not declare a
// new class or enumeration.
assert(D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_typedef &&
"Parser allowed 'typedef' as storage class of condition decl.");
Decl *Dcl = ActOnDeclarator(S, D);
if (!Dcl)
return true;
if (isa<FunctionDecl>(Dcl)) { // The declarator shall not specify a function.
Diag(Dcl->getLocation(), diag::err_invalid_use_of_function_type)
<< D.getSourceRange();
return true;
}
return Dcl;
}
void Sema::LoadExternalVTableUses() {
if (!ExternalSource)
return;
SmallVector<ExternalVTableUse, 4> VTables;
ExternalSource->ReadUsedVTables(VTables);
SmallVector<VTableUse, 4> NewUses;
for (unsigned I = 0, N = VTables.size(); I != N; ++I) {
llvm::DenseMap<CXXRecordDecl *, bool>::iterator Pos
= VTablesUsed.find(VTables[I].Record);
// Even if a definition wasn't required before, it may be required now.
if (Pos != VTablesUsed.end()) {
if (!Pos->second && VTables[I].DefinitionRequired)
Pos->second = true;
continue;
}
VTablesUsed[VTables[I].Record] = VTables[I].DefinitionRequired;
NewUses.push_back(VTableUse(VTables[I].Record, VTables[I].Location));
}
VTableUses.insert(VTableUses.begin(), NewUses.begin(), NewUses.end());
}
void Sema::MarkVTableUsed(SourceLocation Loc, CXXRecordDecl *Class,
bool DefinitionRequired) {
// Ignore any vtable uses in unevaluated operands or for classes that do
// not have a vtable.
if (!Class->isDynamicClass() || Class->isDependentContext() ||
CurContext->isDependentContext() || isUnevaluatedContext())
return;
// Do not mark as used if compiling for the device outside of the target
// region.
if (LangOpts.OpenMP && LangOpts.OpenMPIsDevice &&
!isInOpenMPDeclareTargetContext() &&
!isInOpenMPTargetExecutionDirective()) {
if (!DefinitionRequired)
MarkVirtualMembersReferenced(Loc, Class);
return;
}
// Try to insert this class into the map.
LoadExternalVTableUses();
Class = Class->getCanonicalDecl();
std::pair<llvm::DenseMap<CXXRecordDecl *, bool>::iterator, bool>
Pos = VTablesUsed.insert(std::make_pair(Class, DefinitionRequired));
if (!Pos.second) {
// If we already had an entry, check to see if we are promoting this vtable
// to require a definition. If so, we need to reappend to the VTableUses
// list, since we may have already processed the first entry.
if (DefinitionRequired && !Pos.first->second) {
Pos.first->second = true;
} else {
// Otherwise, we can early exit.
return;
}
} else {
// The Microsoft ABI requires that we perform the destructor body
// checks (i.e. operator delete() lookup) when the vtable is marked used, as
// the deleting destructor is emitted with the vtable, not with the
// destructor definition as in the Itanium ABI.
if (Context.getTargetInfo().getCXXABI().isMicrosoft()) {
CXXDestructorDecl *DD = Class->getDestructor();
if (DD && DD->isVirtual() && !DD->isDeleted()) {
if (Class->hasUserDeclaredDestructor() && !DD->isDefined()) {
// If this is an out-of-line declaration, marking it referenced will
// not do anything. Manually call CheckDestructor to look up operator
// delete().
ContextRAII SavedContext(*this, DD);
CheckDestructor(DD);
} else {
MarkFunctionReferenced(Loc, Class->getDestructor());
}
}
}
}
// Local classes need to have their virtual members marked
// immediately. For all other classes, we mark their virtual members
// at the end of the translation unit.
if (Class->isLocalClass())
MarkVirtualMembersReferenced(Loc, Class);
else
VTableUses.push_back(std::make_pair(Class, Loc));
}
bool Sema::DefineUsedVTables() {
LoadExternalVTableUses();
if (VTableUses.empty())
return false;
// Note: The VTableUses vector could grow as a result of marking
// the members of a class as "used", so we check the size each
// time through the loop and prefer indices (which are stable) to
// iterators (which are not).
bool DefinedAnything = false;
for (unsigned I = 0; I != VTableUses.size(); ++I) {
CXXRecordDecl *Class = VTableUses[I].first->getDefinition();
if (!Class)
continue;
TemplateSpecializationKind ClassTSK =
Class->getTemplateSpecializationKind();
SourceLocation Loc = VTableUses[I].second;
bool DefineVTable = true;
// If this class has a key function, but that key function is
// defined in another translation unit, we don't need to emit the
// vtable even though we're using it.
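    // Illustrative sketch (not part of the original source):
    //   struct S { virtual void key(); };  // 'key' is S's key function; the TU
    //                                      // that defines S::key() emits S's vtable.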
const CXXMethodDecl *KeyFunction = Context.getCurrentKeyFunction(Class);
if (KeyFunction && !KeyFunction->hasBody()) {
// The key function is in another translation unit.
DefineVTable = false;
TemplateSpecializationKind TSK =
KeyFunction->getTemplateSpecializationKind();
assert(TSK != TSK_ExplicitInstantiationDefinition &&
TSK != TSK_ImplicitInstantiation &&
"Instantiations don't have key functions");
(void)TSK;
} else if (!KeyFunction) {
// If we have a class with no key function that is the subject
// of an explicit instantiation declaration, suppress the
// vtable; it will live with the explicit instantiation
// definition.
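      // Illustrative sketch (not part of the original source):
      //   template <typename T> struct P { virtual void f() {} };
      //   extern template struct P<int>;  // declaration: vtable suppressed here
      //   template struct P<int>;         // the defining TU emits the vtable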
bool IsExplicitInstantiationDeclaration =
ClassTSK == TSK_ExplicitInstantiationDeclaration;
for (auto R : Class->redecls()) {
TemplateSpecializationKind TSK
= cast<CXXRecordDecl>(R)->getTemplateSpecializationKind();
if (TSK == TSK_ExplicitInstantiationDeclaration)
IsExplicitInstantiationDeclaration = true;
else if (TSK == TSK_ExplicitInstantiationDefinition) {
IsExplicitInstantiationDeclaration = false;
break;
}
}
if (IsExplicitInstantiationDeclaration)
DefineVTable = false;
}
// The exception specifications for all virtual members may be needed even
// if we are not providing an authoritative form of the vtable in this TU.
// We may choose to emit it available_externally anyway.
if (!DefineVTable) {
MarkVirtualMemberExceptionSpecsNeeded(Loc, Class);
continue;
}
// Mark all of the virtual members of this class as referenced, so
// that we can build a vtable. Then, tell the AST consumer that a
// vtable for this class is required.
DefinedAnything = true;
MarkVirtualMembersReferenced(Loc, Class);
CXXRecordDecl *Canonical = Class->getCanonicalDecl();
if (VTablesUsed[Canonical])
Consumer.HandleVTable(Class);
// Warn if we're emitting a weak vtable. The vtable will be weak if there is
// no key function or the key function is inlined. Don't warn in C++ ABIs
// that lack key functions, since the user won't be able to make one.
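    // Illustrative example (not part of the original source):
    //   struct W { virtual void f() {} };  // every virtual member is inline, so
    //                                      // there is no key function and the
    //                                      // vtable is emitted weakly in each TU.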
if (Context.getTargetInfo().getCXXABI().hasKeyFunctions() &&
Class->isExternallyVisible() && ClassTSK != TSK_ImplicitInstantiation) {
const FunctionDecl *KeyFunctionDef = nullptr;
if (!KeyFunction || (KeyFunction->hasBody(KeyFunctionDef) &&
KeyFunctionDef->isInlined())) {
Diag(Class->getLocation(),
ClassTSK == TSK_ExplicitInstantiationDefinition
? diag::warn_weak_template_vtable
: diag::warn_weak_vtable)
<< Class;
}
}
}
VTableUses.clear();
return DefinedAnything;
}
void Sema::MarkVirtualMemberExceptionSpecsNeeded(SourceLocation Loc,
const CXXRecordDecl *RD) {
for (const auto *I : RD->methods())
if (I->isVirtual() && !I->isPure())
ResolveExceptionSpec(Loc, I->getType()->castAs<FunctionProtoType>());
}
void Sema::MarkVirtualMembersReferenced(SourceLocation Loc,
const CXXRecordDecl *RD,
bool ConstexprOnly) {
// Mark all functions which will appear in RD's vtable as used.
CXXFinalOverriderMap FinalOverriders;
RD->getFinalOverriders(FinalOverriders);
for (CXXFinalOverriderMap::const_iterator I = FinalOverriders.begin(),
E = FinalOverriders.end();
I != E; ++I) {
for (OverridingMethods::const_iterator OI = I->second.begin(),
OE = I->second.end();
OI != OE; ++OI) {
assert(OI->second.size() > 0 && "no final overrider");
CXXMethodDecl *Overrider = OI->second.front().Method;
// C++ [basic.def.odr]p2:
// [...] A virtual member function is used if it is not pure. [...]
if (!Overrider->isPure() && (!ConstexprOnly || Overrider->isConstexpr()))
MarkFunctionReferenced(Loc, Overrider);
}
}
// Only classes that have virtual bases need a VTT.
if (RD->getNumVBases() == 0)
return;
for (const auto &I : RD->bases()) {
const CXXRecordDecl *Base =
cast<CXXRecordDecl>(I.getType()->getAs<RecordType>()->getDecl());
if (Base->getNumVBases() == 0)
continue;
MarkVirtualMembersReferenced(Loc, Base);
}
}
/// SetIvarInitializers - This routine builds initialization ASTs for the
/// Objective-C implementation whose ivars need to be initialized.
void Sema::SetIvarInitializers(ObjCImplementationDecl *ObjCImplementation) {
if (!getLangOpts().CPlusPlus)
return;
if (ObjCInterfaceDecl *OID = ObjCImplementation->getClassInterface()) {
SmallVector<ObjCIvarDecl*, 8> ivars;
CollectIvarsToConstructOrDestruct(OID, ivars);
if (ivars.empty())
return;
SmallVector<CXXCtorInitializer*, 32> AllToInit;
for (unsigned i = 0; i < ivars.size(); i++) {
FieldDecl *Field = ivars[i];
if (Field->isInvalidDecl())
continue;
CXXCtorInitializer *Member;
InitializedEntity InitEntity = InitializedEntity::InitializeMember(Field);
InitializationKind InitKind =
InitializationKind::CreateDefault(ObjCImplementation->getLocation());
InitializationSequence InitSeq(*this, InitEntity, InitKind, None);
ExprResult MemberInit =
InitSeq.Perform(*this, InitEntity, InitKind, None);
MemberInit = MaybeCreateExprWithCleanups(MemberInit);
      // Note that MemberInit could actually come back empty if no initialization
      // is required (e.g., because it would call a trivial default constructor).
if (!MemberInit.get() || MemberInit.isInvalid())
continue;
Member =
new (Context) CXXCtorInitializer(Context, Field, SourceLocation(),
SourceLocation(),
MemberInit.getAs<Expr>(),
SourceLocation());
AllToInit.push_back(Member);
// Be sure that the destructor is accessible and is marked as referenced.
if (const RecordType *RecordTy =
Context.getBaseElementType(Field->getType())
->getAs<RecordType>()) {
CXXRecordDecl *RD = cast<CXXRecordDecl>(RecordTy->getDecl());
if (CXXDestructorDecl *Destructor = LookupDestructor(RD)) {
MarkFunctionReferenced(Field->getLocation(), Destructor);
CheckDestructorAccess(Field->getLocation(), Destructor,
PDiag(diag::err_access_dtor_ivar)
<< Context.getBaseElementType(Field->getType()));
}
}
}
ObjCImplementation->setIvarInitializers(Context,
AllToInit.data(), AllToInit.size());
}
}
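// Helper for CheckDelegatingCtorCycles below. An illustrative example of the
// kind of cycle it detects (not part of the original source):
//   struct C {
//     C()      : C(42) {}  // delegates to C(int)
//     C(int x) : C()   {}  // delegates back to C(): delegation cycle
//   };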
static
void DelegatingCycleHelper(CXXConstructorDecl* Ctor,
llvm::SmallPtrSet<CXXConstructorDecl*, 4> &Valid,
llvm::SmallPtrSet<CXXConstructorDecl*, 4> &Invalid,
llvm::SmallPtrSet<CXXConstructorDecl*, 4> &Current,
Sema &S) {
if (Ctor->isInvalidDecl())
return;
CXXConstructorDecl *Target = Ctor->getTargetConstructor();
// Target may not be determinable yet, for instance if this is a dependent
// call in an uninstantiated template.
if (Target) {
const FunctionDecl *FNTarget = nullptr;
(void)Target->hasBody(FNTarget);
Target = const_cast<CXXConstructorDecl*>(
cast_or_null<CXXConstructorDecl>(FNTarget));
}
CXXConstructorDecl *Canonical = Ctor->getCanonicalDecl(),
// Avoid dereferencing a null pointer here.
                     *TCanonical = Target ? Target->getCanonicalDecl() : nullptr;
if (!Current.insert(Canonical).second)
return;
// We know that beyond here, we aren't chaining into a cycle.
if (!Target || !Target->isDelegatingConstructor() ||
Target->isInvalidDecl() || Valid.count(TCanonical)) {
Valid.insert(Current.begin(), Current.end());
Current.clear();
// We've hit a cycle.
} else if (TCanonical == Canonical || Invalid.count(TCanonical) ||
Current.count(TCanonical)) {
// If we haven't diagnosed this cycle yet, do so now.
if (!Invalid.count(TCanonical)) {
S.Diag((*Ctor->init_begin())->getSourceLocation(),
diag::warn_delegating_ctor_cycle)
<< Ctor;
// Don't add a note for a function delegating directly to itself.
if (TCanonical != Canonical)
S.Diag(Target->getLocation(), diag::note_it_delegates_to);
CXXConstructorDecl *C = Target;
while (C->getCanonicalDecl() != Canonical) {
const FunctionDecl *FNTarget = nullptr;
(void)C->getTargetConstructor()->hasBody(FNTarget);
assert(FNTarget && "Ctor cycle through bodiless function");
C = const_cast<CXXConstructorDecl*>(
cast<CXXConstructorDecl>(FNTarget));
S.Diag(C->getLocation(), diag::note_which_delegates_to);
}
}
Invalid.insert(Current.begin(), Current.end());
Current.clear();
} else {
DelegatingCycleHelper(Target, Valid, Invalid, Current, S);
}
}
void Sema::CheckDelegatingCtorCycles() {
llvm::SmallPtrSet<CXXConstructorDecl*, 4> Valid, Invalid, Current;
for (DelegatingCtorDeclsType::iterator
I = DelegatingCtorDecls.begin(ExternalSource),
E = DelegatingCtorDecls.end();
I != E; ++I)
DelegatingCycleHelper(*I, Valid, Invalid, Current, *this);
for (auto CI = Invalid.begin(), CE = Invalid.end(); CI != CE; ++CI)
(*CI)->setInvalidDecl();
}
namespace {
/// AST visitor that finds references to the 'this' expression.
class FindCXXThisExpr : public RecursiveASTVisitor<FindCXXThisExpr> {
Sema &S;
public:
explicit FindCXXThisExpr(Sema &S) : S(S) { }
bool VisitCXXThisExpr(CXXThisExpr *E) {
S.Diag(E->getLocation(), diag::err_this_static_member_func)
<< E->isImplicit();
return false;
}
};
}
bool Sema::checkThisInStaticMemberFunctionType(CXXMethodDecl *Method) {
TypeSourceInfo *TSInfo = Method->getTypeSourceInfo();
if (!TSInfo)
return false;
TypeLoc TL = TSInfo->getTypeLoc();
FunctionProtoTypeLoc ProtoTL = TL.getAs<FunctionProtoTypeLoc>();
if (!ProtoTL)
return false;
// C++11 [expr.prim.general]p3:
// [The expression this] shall not appear before the optional
// cv-qualifier-seq and it shall not appear within the declaration of a
// static member function (although its type and value category are defined
// within a static member function as they are within a non-static member
// function). [ Note: this is because declaration matching does not occur
// until the complete declarator is known. - end note ]
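  // Illustrative example (not part of the original source):
  //   struct S {
  //     int n;
  //     static auto f() -> decltype(this);  // error: 'this' in the declaration
  //   };                                    // of a static member function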
const FunctionProtoType *Proto = ProtoTL.getTypePtr();
FindCXXThisExpr Finder(*this);
// If the return type came after the cv-qualifier-seq, check it now.
if (Proto->hasTrailingReturn() &&
!Finder.TraverseTypeLoc(ProtoTL.getReturnLoc()))
return true;
// Check the exception specification.
if (checkThisInStaticMemberFunctionExceptionSpec(Method))
return true;
return checkThisInStaticMemberFunctionAttributes(Method);
}
bool Sema::checkThisInStaticMemberFunctionExceptionSpec(CXXMethodDecl *Method) {
TypeSourceInfo *TSInfo = Method->getTypeSourceInfo();
if (!TSInfo)
return false;
TypeLoc TL = TSInfo->getTypeLoc();
FunctionProtoTypeLoc ProtoTL = TL.getAs<FunctionProtoTypeLoc>();
if (!ProtoTL)
return false;
const FunctionProtoType *Proto = ProtoTL.getTypePtr();
FindCXXThisExpr Finder(*this);
switch (Proto->getExceptionSpecType()) {
case EST_Unparsed:
case EST_Uninstantiated:
case EST_Unevaluated:
case EST_BasicNoexcept:
case EST_NoThrow:
case EST_DynamicNone:
case EST_MSAny:
case EST_None:
break;
case EST_DependentNoexcept:
case EST_NoexceptFalse:
case EST_NoexceptTrue:
if (!Finder.TraverseStmt(Proto->getNoexceptExpr()))
return true;
LLVM_FALLTHROUGH;
case EST_Dynamic:
for (const auto &E : Proto->exceptions()) {
if (!Finder.TraverseType(E))
return true;
}
break;
}
return false;
}
bool Sema::checkThisInStaticMemberFunctionAttributes(CXXMethodDecl *Method) {
FindCXXThisExpr Finder(*this);
// Check attributes.
for (const auto *A : Method->attrs()) {
// FIXME: This should be emitted by tblgen.
Expr *Arg = nullptr;
ArrayRef<Expr *> Args;
if (const auto *G = dyn_cast<GuardedByAttr>(A))
Arg = G->getArg();
else if (const auto *G = dyn_cast<PtGuardedByAttr>(A))
Arg = G->getArg();
else if (const auto *AA = dyn_cast<AcquiredAfterAttr>(A))
Args = llvm::makeArrayRef(AA->args_begin(), AA->args_size());
else if (const auto *AB = dyn_cast<AcquiredBeforeAttr>(A))
Args = llvm::makeArrayRef(AB->args_begin(), AB->args_size());
else if (const auto *ETLF = dyn_cast<ExclusiveTrylockFunctionAttr>(A)) {
Arg = ETLF->getSuccessValue();
Args = llvm::makeArrayRef(ETLF->args_begin(), ETLF->args_size());
} else if (const auto *STLF = dyn_cast<SharedTrylockFunctionAttr>(A)) {
Arg = STLF->getSuccessValue();
Args = llvm::makeArrayRef(STLF->args_begin(), STLF->args_size());
} else if (const auto *LR = dyn_cast<LockReturnedAttr>(A))
Arg = LR->getArg();
else if (const auto *LE = dyn_cast<LocksExcludedAttr>(A))
Args = llvm::makeArrayRef(LE->args_begin(), LE->args_size());
else if (const auto *RC = dyn_cast<RequiresCapabilityAttr>(A))
Args = llvm::makeArrayRef(RC->args_begin(), RC->args_size());
else if (const auto *AC = dyn_cast<AcquireCapabilityAttr>(A))
Args = llvm::makeArrayRef(AC->args_begin(), AC->args_size());
else if (const auto *AC = dyn_cast<TryAcquireCapabilityAttr>(A))
Args = llvm::makeArrayRef(AC->args_begin(), AC->args_size());
else if (const auto *RC = dyn_cast<ReleaseCapabilityAttr>(A))
Args = llvm::makeArrayRef(RC->args_begin(), RC->args_size());
if (Arg && !Finder.TraverseStmt(Arg))
return true;
for (unsigned I = 0, N = Args.size(); I != N; ++I) {
if (!Finder.TraverseStmt(Args[I]))
return true;
}
}
return false;
}
void Sema::checkExceptionSpecification(
bool IsTopLevel, ExceptionSpecificationType EST,
ArrayRef<ParsedType> DynamicExceptions,
ArrayRef<SourceRange> DynamicExceptionRanges, Expr *NoexceptExpr,
SmallVectorImpl<QualType> &Exceptions,
FunctionProtoType::ExceptionSpecInfo &ESI) {
Exceptions.clear();
ESI.Type = EST;
if (EST == EST_Dynamic) {
Exceptions.reserve(DynamicExceptions.size());
for (unsigned ei = 0, ee = DynamicExceptions.size(); ei != ee; ++ei) {
// FIXME: Preserve type source info.
QualType ET = GetTypeFromParser(DynamicExceptions[ei]);
if (IsTopLevel) {
SmallVector<UnexpandedParameterPack, 2> Unexpanded;
collectUnexpandedParameterPacks(ET, Unexpanded);
if (!Unexpanded.empty()) {
DiagnoseUnexpandedParameterPacks(
DynamicExceptionRanges[ei].getBegin(), UPPC_ExceptionType,
Unexpanded);
continue;
}
}
// Check that the type is valid for an exception spec, and
// drop it if not.
if (!CheckSpecifiedExceptionType(ET, DynamicExceptionRanges[ei]))
Exceptions.push_back(ET);
}
ESI.Exceptions = Exceptions;
return;
}
if (isComputedNoexcept(EST)) {
assert((NoexceptExpr->isTypeDependent() ||
NoexceptExpr->getType()->getCanonicalTypeUnqualified() ==
Context.BoolTy) &&
"Parser should have made sure that the expression is boolean");
if (IsTopLevel && DiagnoseUnexpandedParameterPack(NoexceptExpr)) {
ESI.Type = EST_BasicNoexcept;
return;
}
ESI.NoexceptExpr = NoexceptExpr;
return;
}
}
void Sema::actOnDelayedExceptionSpecification(Decl *MethodD,
ExceptionSpecificationType EST,
SourceRange SpecificationRange,
ArrayRef<ParsedType> DynamicExceptions,
ArrayRef<SourceRange> DynamicExceptionRanges,
Expr *NoexceptExpr) {
if (!MethodD)
return;
// Dig out the method we're referring to.
if (FunctionTemplateDecl *FunTmpl = dyn_cast<FunctionTemplateDecl>(MethodD))
MethodD = FunTmpl->getTemplatedDecl();
CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(MethodD);
if (!Method)
return;
// Check the exception specification.
llvm::SmallVector<QualType, 4> Exceptions;
FunctionProtoType::ExceptionSpecInfo ESI;
checkExceptionSpecification(/*IsTopLevel*/true, EST, DynamicExceptions,
DynamicExceptionRanges, NoexceptExpr, Exceptions,
ESI);
// Update the exception specification on the function type.
Context.adjustExceptionSpec(Method, ESI, /*AsWritten*/true);
if (Method->isStatic())
checkThisInStaticMemberFunctionExceptionSpec(Method);
if (Method->isVirtual()) {
// Check overrides, which we previously had to delay.
for (const CXXMethodDecl *O : Method->overridden_methods())
CheckOverridingFunctionExceptionSpec(Method, O);
}
}
/// HandleMSProperty - Analyze a __declspec(property) field of a C++ class.
///
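/// Illustrative example (not part of the original source):
///   struct S {
///     int GetX();
///     void PutX(int);
///     __declspec(property(get = GetX, put = PutX)) int x;
///   };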
MSPropertyDecl *Sema::HandleMSProperty(Scope *S, RecordDecl *Record,
SourceLocation DeclStart, Declarator &D,
Expr *BitWidth,
InClassInitStyle InitStyle,
AccessSpecifier AS,
const ParsedAttr &MSPropertyAttr) {
IdentifierInfo *II = D.getIdentifier();
if (!II) {
Diag(DeclStart, diag::err_anonymous_property);
return nullptr;
}
SourceLocation Loc = D.getIdentifierLoc();
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
QualType T = TInfo->getType();
if (getLangOpts().CPlusPlus) {
CheckExtraCXXDefaultArguments(D);
if (DiagnoseUnexpandedParameterPack(D.getIdentifierLoc(), TInfo,
UPPC_DataMemberType)) {
D.setInvalidType();
T = Context.IntTy;
TInfo = Context.getTrivialTypeSourceInfo(T, Loc);
}
}
DiagnoseFunctionSpecifiers(D.getDeclSpec());
if (D.getDeclSpec().isInlineSpecified())
Diag(D.getDeclSpec().getInlineSpecLoc(), diag::err_inline_non_function)
<< getLangOpts().CPlusPlus17;
if (DeclSpec::TSCS TSCS = D.getDeclSpec().getThreadStorageClassSpec())
Diag(D.getDeclSpec().getThreadStorageClassSpecLoc(),
diag::err_invalid_thread)
<< DeclSpec::getSpecifierName(TSCS);
// Check to see if this name was declared as a member previously
NamedDecl *PrevDecl = nullptr;
LookupResult Previous(*this, II, Loc, LookupMemberName,
ForVisibleRedeclaration);
LookupName(Previous, S);
switch (Previous.getResultKind()) {
case LookupResult::Found:
case LookupResult::FoundUnresolvedValue:
PrevDecl = Previous.getAsSingle<NamedDecl>();
break;
case LookupResult::FoundOverloaded:
PrevDecl = Previous.getRepresentativeDecl();
break;
case LookupResult::NotFound:
case LookupResult::NotFoundInCurrentInstantiation:
case LookupResult::Ambiguous:
break;
}
if (PrevDecl && PrevDecl->isTemplateParameter()) {
// Maybe we will complain about the shadowed template parameter.
DiagnoseTemplateParameterShadow(D.getIdentifierLoc(), PrevDecl);
// Just pretend that we didn't see the previous declaration.
PrevDecl = nullptr;
}
if (PrevDecl && !isDeclInScope(PrevDecl, Record, S))
PrevDecl = nullptr;
SourceLocation TSSL = D.getBeginLoc();
MSPropertyDecl *NewPD =
MSPropertyDecl::Create(Context, Record, Loc, II, T, TInfo, TSSL,
MSPropertyAttr.getPropertyDataGetter(),
MSPropertyAttr.getPropertyDataSetter());
ProcessDeclAttributes(TUScope, NewPD, D);
NewPD->setAccess(AS);
if (NewPD->isInvalidDecl())
Record->setInvalidDecl();
if (D.getDeclSpec().isModulePrivateSpecified())
NewPD->setModulePrivate();
if (NewPD->isInvalidDecl() && PrevDecl) {
// Don't introduce NewFD into scope; there's already something
// with the same name in the same scope.
} else if (II) {
PushOnScopeChains(NewPD, S);
} else
Record->addDecl(NewPD);
return NewPD;
}
Index: projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaTemplate.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaTemplate.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaTemplate.cpp (revision 351722)
@@ -1,10476 +1,10477 @@
//===------- SemaTemplate.cpp - Semantic Analysis for C++ Templates -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===----------------------------------------------------------------------===//
//
// This file implements semantic analysis for C++ templates.
//===----------------------------------------------------------------------===//
#include "TreeTransform.h"
#include "clang/AST/ASTConsumer.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/DeclFriend.h"
#include "clang/AST/DeclTemplate.h"
#include "clang/AST/Expr.h"
#include "clang/AST/ExprCXX.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/TypeVisitor.h"
#include "clang/Basic/Builtins.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/PartialDiagnostic.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Sema/DeclSpec.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/ParsedTemplate.h"
#include "clang/Sema/Scope.h"
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/Template.h"
#include "clang/Sema/TemplateDeduction.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include <iterator>
using namespace clang;
using namespace sema;
// Exported for use by Parser.
SourceRange
clang::getTemplateParamsRange(TemplateParameterList const * const *Ps,
unsigned N) {
if (!N) return SourceRange();
return SourceRange(Ps[0]->getTemplateLoc(), Ps[N-1]->getRAngleLoc());
}
namespace clang {
/// [temp.constr.decl]p2: A template's associated constraints are
/// defined as a single constraint-expression derived from the introduced
/// constraint-expressions [ ... ].
///
/// \param Params The template parameter list and optional requires-clause.
///
/// \param FD The underlying templated function declaration for a function
/// template.
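///
/// Illustrative example (not part of the original source; 'SomeConcept' is a
/// hypothetical concept): for
///   template <typename T> requires SomeConcept<T> void f(T);
/// the associated constraints currently consist of just the requires-clause,
/// i.e. SomeConcept<T>.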
static Expr *formAssociatedConstraints(TemplateParameterList *Params,
FunctionDecl *FD);
}
static Expr *clang::formAssociatedConstraints(TemplateParameterList *Params,
FunctionDecl *FD) {
// FIXME: Concepts: collect additional introduced constraint-expressions
assert(!FD && "Cannot collect constraints from function declaration yet.");
return Params->getRequiresClause();
}
/// Determine whether the declaration found is acceptable as the name
/// of a template and, if so, return that template declaration. Otherwise,
/// returns null.
///
/// Note that this may return an UnresolvedUsingValueDecl if AllowDependent
/// is true. In all other cases it will return a TemplateDecl (or null).
NamedDecl *Sema::getAsTemplateNameDecl(NamedDecl *D,
bool AllowFunctionTemplates,
bool AllowDependent) {
D = D->getUnderlyingDecl();
if (isa<TemplateDecl>(D)) {
if (!AllowFunctionTemplates && isa<FunctionTemplateDecl>(D))
return nullptr;
return D;
}
if (CXXRecordDecl *Record = dyn_cast<CXXRecordDecl>(D)) {
// C++ [temp.local]p1:
// Like normal (non-template) classes, class templates have an
// injected-class-name (Clause 9). The injected-class-name
// can be used with or without a template-argument-list. When
// it is used without a template-argument-list, it is
// equivalent to the injected-class-name followed by the
// template-parameters of the class template enclosed in
// <>. When it is used with a template-argument-list, it
// refers to the specified class template specialization,
// which could be the current specialization or another
// specialization.
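    // For example (illustrative, not part of the original source):
    //   template <typename T> struct A {
    //     A *p;       // injected-class-name without a template-argument-list: A<T>
    //     A<int> *q;  // with a template-argument-list: a specific specialization
    //   };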
if (Record->isInjectedClassName()) {
Record = cast<CXXRecordDecl>(Record->getDeclContext());
if (Record->getDescribedClassTemplate())
return Record->getDescribedClassTemplate();
if (ClassTemplateSpecializationDecl *Spec
= dyn_cast<ClassTemplateSpecializationDecl>(Record))
return Spec->getSpecializedTemplate();
}
return nullptr;
}
// 'using Dependent::foo;' can resolve to a template name.
// 'using typename Dependent::foo;' cannot (not even if 'foo' is an
// injected-class-name).
if (AllowDependent && isa<UnresolvedUsingValueDecl>(D))
return D;
return nullptr;
}
void Sema::FilterAcceptableTemplateNames(LookupResult &R,
bool AllowFunctionTemplates,
bool AllowDependent) {
LookupResult::Filter filter = R.makeFilter();
while (filter.hasNext()) {
NamedDecl *Orig = filter.next();
if (!getAsTemplateNameDecl(Orig, AllowFunctionTemplates, AllowDependent))
filter.erase();
}
filter.done();
}
bool Sema::hasAnyAcceptableTemplateNames(LookupResult &R,
bool AllowFunctionTemplates,
bool AllowDependent,
bool AllowNonTemplateFunctions) {
for (LookupResult::iterator I = R.begin(), IEnd = R.end(); I != IEnd; ++I) {
if (getAsTemplateNameDecl(*I, AllowFunctionTemplates, AllowDependent))
return true;
if (AllowNonTemplateFunctions &&
isa<FunctionDecl>((*I)->getUnderlyingDecl()))
return true;
}
return false;
}
TemplateNameKind Sema::isTemplateName(Scope *S,
CXXScopeSpec &SS,
bool hasTemplateKeyword,
const UnqualifiedId &Name,
ParsedType ObjectTypePtr,
bool EnteringContext,
TemplateTy &TemplateResult,
bool &MemberOfUnknownSpecialization) {
assert(getLangOpts().CPlusPlus && "No template names in C!");
DeclarationName TName;
MemberOfUnknownSpecialization = false;
switch (Name.getKind()) {
case UnqualifiedIdKind::IK_Identifier:
TName = DeclarationName(Name.Identifier);
break;
case UnqualifiedIdKind::IK_OperatorFunctionId:
TName = Context.DeclarationNames.getCXXOperatorName(
Name.OperatorFunctionId.Operator);
break;
case UnqualifiedIdKind::IK_LiteralOperatorId:
TName = Context.DeclarationNames.getCXXLiteralOperatorName(Name.Identifier);
break;
default:
return TNK_Non_template;
}
QualType ObjectType = ObjectTypePtr.get();
AssumedTemplateKind AssumedTemplate;
LookupResult R(*this, TName, Name.getBeginLoc(), LookupOrdinaryName);
if (LookupTemplateName(R, S, SS, ObjectType, EnteringContext,
MemberOfUnknownSpecialization, SourceLocation(),
&AssumedTemplate))
return TNK_Non_template;
if (AssumedTemplate != AssumedTemplateKind::None) {
TemplateResult = TemplateTy::make(Context.getAssumedTemplateName(TName));
// Let the parser know whether we found nothing or found functions; if we
// found nothing, we want to more carefully check whether this is actually
// a function template name versus some other kind of undeclared identifier.
return AssumedTemplate == AssumedTemplateKind::FoundNothing
? TNK_Undeclared_template
: TNK_Function_template;
}
if (R.empty())
return TNK_Non_template;
NamedDecl *D = nullptr;
if (R.isAmbiguous()) {
// If we got an ambiguity involving a non-function template, treat this
// as a template name, and pick an arbitrary template for error recovery.
bool AnyFunctionTemplates = false;
for (NamedDecl *FoundD : R) {
if (NamedDecl *FoundTemplate = getAsTemplateNameDecl(FoundD)) {
if (isa<FunctionTemplateDecl>(FoundTemplate))
AnyFunctionTemplates = true;
else {
D = FoundTemplate;
break;
}
}
}
// If we didn't find any templates at all, this isn't a template name.
// Leave the ambiguity for a later lookup to diagnose.
if (!D && !AnyFunctionTemplates) {
R.suppressDiagnostics();
return TNK_Non_template;
}
// If the only templates were function templates, filter out the rest.
// We'll diagnose the ambiguity later.
if (!D)
FilterAcceptableTemplateNames(R);
}
// At this point, we have either picked a single template name declaration D
// or we have a non-empty set of results R containing either one template name
// declaration or a set of function templates.
TemplateName Template;
TemplateNameKind TemplateKind;
unsigned ResultCount = R.end() - R.begin();
if (!D && ResultCount > 1) {
// We assume that we'll preserve the qualifier from a function
// template name in other ways.
Template = Context.getOverloadedTemplateName(R.begin(), R.end());
TemplateKind = TNK_Function_template;
// We'll do this lookup again later.
R.suppressDiagnostics();
} else {
if (!D) {
D = getAsTemplateNameDecl(*R.begin());
assert(D && "unambiguous result is not a template name");
}
if (isa<UnresolvedUsingValueDecl>(D)) {
// We don't yet know whether this is a template-name or not.
MemberOfUnknownSpecialization = true;
return TNK_Non_template;
}
TemplateDecl *TD = cast<TemplateDecl>(D);
if (SS.isSet() && !SS.isInvalid()) {
NestedNameSpecifier *Qualifier = SS.getScopeRep();
Template = Context.getQualifiedTemplateName(Qualifier,
hasTemplateKeyword, TD);
} else {
Template = TemplateName(TD);
}
if (isa<FunctionTemplateDecl>(TD)) {
TemplateKind = TNK_Function_template;
// We'll do this lookup again later.
R.suppressDiagnostics();
} else {
assert(isa<ClassTemplateDecl>(TD) || isa<TemplateTemplateParmDecl>(TD) ||
isa<TypeAliasTemplateDecl>(TD) || isa<VarTemplateDecl>(TD) ||
isa<BuiltinTemplateDecl>(TD) || isa<ConceptDecl>(TD));
TemplateKind =
isa<VarTemplateDecl>(TD) ? TNK_Var_template :
isa<ConceptDecl>(TD) ? TNK_Concept_template :
TNK_Type_template;
}
}
TemplateResult = TemplateTy::make(Template);
return TemplateKind;
}
bool Sema::isDeductionGuideName(Scope *S, const IdentifierInfo &Name,
SourceLocation NameLoc,
ParsedTemplateTy *Template) {
CXXScopeSpec SS;
bool MemberOfUnknownSpecialization = false;
// We could use redeclaration lookup here, but we don't need to: the
// syntactic form of a deduction guide is enough to identify it even
// if we can't look up the template name at all.
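  // Illustrative example (not part of the original source):
  //   template <typename T> struct Box { Box(T); };
  //   Box(const char *) -> Box<std::string>;  // 'Box' before '(' is a
  //                                           // deduction-guide name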
LookupResult R(*this, DeclarationName(&Name), NameLoc, LookupOrdinaryName);
if (LookupTemplateName(R, S, SS, /*ObjectType*/ QualType(),
/*EnteringContext*/ false,
MemberOfUnknownSpecialization))
return false;
if (R.empty()) return false;
if (R.isAmbiguous()) {
// FIXME: Diagnose an ambiguity if we find at least one template.
R.suppressDiagnostics();
return false;
}
// We only treat template-names that name type templates as valid deduction
// guide names.
TemplateDecl *TD = R.getAsSingle<TemplateDecl>();
if (!TD || !getAsTypeTemplateDecl(TD))
return false;
if (Template)
*Template = TemplateTy::make(TemplateName(TD));
return true;
}
bool Sema::DiagnoseUnknownTemplateName(const IdentifierInfo &II,
SourceLocation IILoc,
Scope *S,
const CXXScopeSpec *SS,
TemplateTy &SuggestedTemplate,
TemplateNameKind &SuggestedKind) {
// We can't recover unless there's a dependent scope specifier preceding the
// template name.
// FIXME: Typo correction?
if (!SS || !SS->isSet() || !isDependentScopeSpecifier(*SS) ||
computeDeclContext(*SS))
return false;
// The code is missing a 'template' keyword prior to the dependent template
// name.
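  // Illustrative example (not part of the original source):
  //   template <typename T> void g() {
  //     typename T::template nested<int> x;  // OK
  //     typename T::nested<int> y;           // error: 'template' keyword missing
  //   }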
NestedNameSpecifier *Qualifier = (NestedNameSpecifier*)SS->getScopeRep();
Diag(IILoc, diag::err_template_kw_missing)
<< Qualifier << II.getName()
<< FixItHint::CreateInsertion(IILoc, "template ");
SuggestedTemplate
= TemplateTy::make(Context.getDependentTemplateName(Qualifier, &II));
SuggestedKind = TNK_Dependent_template_name;
return true;
}
bool Sema::LookupTemplateName(LookupResult &Found,
Scope *S, CXXScopeSpec &SS,
QualType ObjectType,
bool EnteringContext,
bool &MemberOfUnknownSpecialization,
SourceLocation TemplateKWLoc,
AssumedTemplateKind *ATK) {
if (ATK)
*ATK = AssumedTemplateKind::None;
Found.setTemplateNameLookup(true);
// Determine where to perform name lookup
MemberOfUnknownSpecialization = false;
DeclContext *LookupCtx = nullptr;
bool IsDependent = false;
if (!ObjectType.isNull()) {
// This nested-name-specifier occurs in a member access expression, e.g.,
// x->B::f, and we are looking into the type of the object.
assert(!SS.isSet() && "ObjectType and scope specifier cannot coexist");
LookupCtx = computeDeclContext(ObjectType);
IsDependent = !LookupCtx && ObjectType->isDependentType();
assert((IsDependent || !ObjectType->isIncompleteType() ||
ObjectType->castAs<TagType>()->isBeingDefined()) &&
"Caller should have completed object type");
// Template names cannot appear inside an Objective-C class or object type
// or a vector type.
//
// FIXME: This is wrong. For example:
//
// template<typename T> using Vec = T __attribute__((ext_vector_type(4)));
// Vec<int> vi;
// vi.Vec<int>::~Vec<int>();
//
// ... should be accepted but we will not treat 'Vec' as a template name
// here. The right thing to do would be to check if the name is a valid
// vector component name, and look up a template name if not. And similarly
// for lookups into Objective-C class and object types, where the same
// problem can arise.
if (ObjectType->isObjCObjectOrInterfaceType() ||
ObjectType->isVectorType()) {
Found.clear();
return false;
}
} else if (SS.isSet()) {
// This nested-name-specifier occurs after another nested-name-specifier,
    // so look into the context associated with the prior nested-name-specifier.
LookupCtx = computeDeclContext(SS, EnteringContext);
IsDependent = !LookupCtx;
// The declaration context must be complete.
if (LookupCtx && RequireCompleteDeclContext(SS, LookupCtx))
return true;
}
bool ObjectTypeSearchedInScope = false;
bool AllowFunctionTemplatesInLookup = true;
if (LookupCtx) {
// Perform "qualified" name lookup into the declaration context we
// computed, which is either the type of the base of a member access
// expression or the declaration context associated with a prior
// nested-name-specifier.
LookupQualifiedName(Found, LookupCtx);
// FIXME: The C++ standard does not clearly specify what happens in the
// case where the object type is dependent, and implementations vary. In
// Clang, we treat a name after a . or -> as a template-name if lookup
// finds a non-dependent member or member of the current instantiation that
// is a type template, or finds no such members and lookup in the context
// of the postfix-expression finds a type template. In the latter case, the
// name is nonetheless dependent, and we may resolve it to a member of an
// unknown specialization when we come to instantiate the template.
IsDependent |= Found.wasNotFoundInCurrentInstantiation();
}
if (!SS.isSet() && (ObjectType.isNull() || Found.empty())) {
// C++ [basic.lookup.classref]p1:
// In a class member access expression (5.2.5), if the . or -> token is
// immediately followed by an identifier followed by a <, the
// identifier must be looked up to determine whether the < is the
// beginning of a template argument list (14.2) or a less-than operator.
// The identifier is first looked up in the class of the object
// expression. If the identifier is not found, it is then looked up in
// the context of the entire postfix-expression and shall name a class
// template.
if (S)
LookupName(Found, S);
if (!ObjectType.isNull()) {
// FIXME: We should filter out all non-type templates here, particularly
// variable templates and concepts. But the exclusion of alias templates
// and template template parameters is a wording defect.
AllowFunctionTemplatesInLookup = false;
ObjectTypeSearchedInScope = true;
}
IsDependent |= Found.wasNotFoundInCurrentInstantiation();
}
if (Found.isAmbiguous())
return false;
if (ATK && !SS.isSet() && ObjectType.isNull() && TemplateKWLoc.isInvalid()) {
// C++2a [temp.names]p2:
// A name is also considered to refer to a template if it is an
// unqualified-id followed by a < and name lookup finds either one or more
// functions or finds nothing.
//
// To keep our behavior consistent, we apply the "finds nothing" part in
// all language modes, and diagnose the empty lookup in ActOnCallExpr if we
// successfully form a call to an undeclared template-id.
bool AllFunctions =
getLangOpts().CPlusPlus2a &&
std::all_of(Found.begin(), Found.end(), [](NamedDecl *ND) {
return isa<FunctionDecl>(ND->getUnderlyingDecl());
});
if (AllFunctions || (Found.empty() && !IsDependent)) {
// If lookup found any functions, or if this is a name that can only be
// used for a function, then strongly assume this is a function
// template-id.
*ATK = (Found.empty() && Found.getLookupName().isIdentifier())
? AssumedTemplateKind::FoundNothing
: AssumedTemplateKind::FoundFunctions;
Found.clear();
return false;
}
}
if (Found.empty() && !IsDependent) {
// If we did not find any names, attempt to correct any typos.
DeclarationName Name = Found.getLookupName();
Found.clear();
    // Simple filter callback that, for keywords, only accepts the C++ *_cast names.
DefaultFilterCCC FilterCCC{};
FilterCCC.WantTypeSpecifiers = false;
FilterCCC.WantExpressionKeywords = false;
FilterCCC.WantRemainingKeywords = false;
FilterCCC.WantCXXNamedCasts = true;
if (TypoCorrection Corrected =
CorrectTypo(Found.getLookupNameInfo(), Found.getLookupKind(), S,
&SS, FilterCCC, CTK_ErrorRecovery, LookupCtx)) {
if (auto *ND = Corrected.getFoundDecl())
Found.addDecl(ND);
FilterAcceptableTemplateNames(Found);
if (Found.isAmbiguous()) {
Found.clear();
} else if (!Found.empty()) {
Found.setLookupName(Corrected.getCorrection());
if (LookupCtx) {
std::string CorrectedStr(Corrected.getAsString(getLangOpts()));
bool DroppedSpecifier = Corrected.WillReplaceSpecifier() &&
Name.getAsString() == CorrectedStr;
diagnoseTypo(Corrected, PDiag(diag::err_no_member_template_suggest)
<< Name << LookupCtx << DroppedSpecifier
<< SS.getRange());
} else {
diagnoseTypo(Corrected, PDiag(diag::err_no_template_suggest) << Name);
}
}
}
}
NamedDecl *ExampleLookupResult =
Found.empty() ? nullptr : Found.getRepresentativeDecl();
FilterAcceptableTemplateNames(Found, AllowFunctionTemplatesInLookup);
if (Found.empty()) {
if (IsDependent) {
MemberOfUnknownSpecialization = true;
return false;
}
// If a 'template' keyword was used, a lookup that finds only non-template
// names is an error.
if (ExampleLookupResult && TemplateKWLoc.isValid()) {
Diag(Found.getNameLoc(), diag::err_template_kw_refers_to_non_template)
<< Found.getLookupName() << SS.getRange();
Diag(ExampleLookupResult->getUnderlyingDecl()->getLocation(),
diag::note_template_kw_refers_to_non_template)
<< Found.getLookupName();
return true;
}
return false;
}
if (S && !ObjectType.isNull() && !ObjectTypeSearchedInScope &&
!getLangOpts().CPlusPlus11) {
// C++03 [basic.lookup.classref]p1:
// [...] If the lookup in the class of the object expression finds a
// template, the name is also looked up in the context of the entire
// postfix-expression and [...]
//
// Note: C++11 does not perform this second lookup.
LookupResult FoundOuter(*this, Found.getLookupName(), Found.getNameLoc(),
LookupOrdinaryName);
FoundOuter.setTemplateNameLookup(true);
LookupName(FoundOuter, S);
// FIXME: We silently accept an ambiguous lookup here, in violation of
// [basic.lookup]/1.
FilterAcceptableTemplateNames(FoundOuter, /*AllowFunctionTemplates=*/false);
NamedDecl *OuterTemplate;
if (FoundOuter.empty()) {
// - if the name is not found, the name found in the class of the
// object expression is used, otherwise
} else if (FoundOuter.isAmbiguous() || !FoundOuter.isSingleResult() ||
!(OuterTemplate =
getAsTemplateNameDecl(FoundOuter.getFoundDecl()))) {
// - if the name is found in the context of the entire
// postfix-expression and does not name a class template, the name
// found in the class of the object expression is used, otherwise
FoundOuter.clear();
} else if (!Found.isSuppressingDiagnostics()) {
// - if the name found is a class template, it must refer to the same
// entity as the one found in the class of the object expression,
// otherwise the program is ill-formed.
if (!Found.isSingleResult() ||
getAsTemplateNameDecl(Found.getFoundDecl())->getCanonicalDecl() !=
OuterTemplate->getCanonicalDecl()) {
Diag(Found.getNameLoc(),
diag::ext_nested_name_member_ref_lookup_ambiguous)
<< Found.getLookupName()
<< ObjectType;
Diag(Found.getRepresentativeDecl()->getLocation(),
diag::note_ambig_member_ref_object_type)
<< ObjectType;
Diag(FoundOuter.getFoundDecl()->getLocation(),
diag::note_ambig_member_ref_scope);
// Recover by taking the template that we found in the object
// expression's type.
}
}
}
return false;
}
void Sema::diagnoseExprIntendedAsTemplateName(Scope *S, ExprResult TemplateName,
SourceLocation Less,
SourceLocation Greater) {
if (TemplateName.isInvalid())
return;
DeclarationNameInfo NameInfo;
CXXScopeSpec SS;
LookupNameKind LookupKind;
DeclContext *LookupCtx = nullptr;
NamedDecl *Found = nullptr;
bool MissingTemplateKeyword = false;
// Figure out what name we looked up.
if (auto *DRE = dyn_cast<DeclRefExpr>(TemplateName.get())) {
NameInfo = DRE->getNameInfo();
SS.Adopt(DRE->getQualifierLoc());
LookupKind = LookupOrdinaryName;
Found = DRE->getFoundDecl();
} else if (auto *ME = dyn_cast<MemberExpr>(TemplateName.get())) {
NameInfo = ME->getMemberNameInfo();
SS.Adopt(ME->getQualifierLoc());
LookupKind = LookupMemberName;
LookupCtx = ME->getBase()->getType()->getAsCXXRecordDecl();
Found = ME->getMemberDecl();
} else if (auto *DSDRE =
dyn_cast<DependentScopeDeclRefExpr>(TemplateName.get())) {
NameInfo = DSDRE->getNameInfo();
SS.Adopt(DSDRE->getQualifierLoc());
MissingTemplateKeyword = true;
} else if (auto *DSME =
dyn_cast<CXXDependentScopeMemberExpr>(TemplateName.get())) {
NameInfo = DSME->getMemberNameInfo();
SS.Adopt(DSME->getQualifierLoc());
MissingTemplateKeyword = true;
} else {
llvm_unreachable("unexpected kind of potential template name");
}
// If this is a dependent-scope lookup, diagnose that the 'template' keyword
// was missing.
if (MissingTemplateKeyword) {
Diag(NameInfo.getBeginLoc(), diag::err_template_kw_missing)
<< "" << NameInfo.getName().getAsString() << SourceRange(Less, Greater);
return;
}
// Try to correct the name by looking for templates and C++ named casts.
struct TemplateCandidateFilter : CorrectionCandidateCallback {
Sema &S;
TemplateCandidateFilter(Sema &S) : S(S) {
WantTypeSpecifiers = false;
WantExpressionKeywords = false;
WantRemainingKeywords = false;
WantCXXNamedCasts = true;
};
bool ValidateCandidate(const TypoCorrection &Candidate) override {
if (auto *ND = Candidate.getCorrectionDecl())
return S.getAsTemplateNameDecl(ND);
return Candidate.isKeyword();
}
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return llvm::make_unique<TemplateCandidateFilter>(*this);
}
};
DeclarationName Name = NameInfo.getName();
TemplateCandidateFilter CCC(*this);
if (TypoCorrection Corrected = CorrectTypo(NameInfo, LookupKind, S, &SS, CCC,
CTK_ErrorRecovery, LookupCtx)) {
auto *ND = Corrected.getFoundDecl();
if (ND)
ND = getAsTemplateNameDecl(ND);
if (ND || Corrected.isKeyword()) {
if (LookupCtx) {
std::string CorrectedStr(Corrected.getAsString(getLangOpts()));
bool DroppedSpecifier = Corrected.WillReplaceSpecifier() &&
Name.getAsString() == CorrectedStr;
diagnoseTypo(Corrected,
PDiag(diag::err_non_template_in_member_template_id_suggest)
<< Name << LookupCtx << DroppedSpecifier
<< SS.getRange(), false);
} else {
diagnoseTypo(Corrected,
PDiag(diag::err_non_template_in_template_id_suggest)
<< Name, false);
}
if (Found)
Diag(Found->getLocation(),
diag::note_non_template_in_template_id_found);
return;
}
}
Diag(NameInfo.getLoc(), diag::err_non_template_in_template_id)
<< Name << SourceRange(Less, Greater);
if (Found)
Diag(Found->getLocation(), diag::note_non_template_in_template_id_found);
}
/// ActOnDependentIdExpression - Handle a dependent id-expression that
/// was just parsed. This is only possible with an explicit scope
/// specifier naming a dependent type.
ExprResult
Sema::ActOnDependentIdExpression(const CXXScopeSpec &SS,
SourceLocation TemplateKWLoc,
const DeclarationNameInfo &NameInfo,
bool isAddressOfOperand,
const TemplateArgumentListInfo *TemplateArgs) {
DeclContext *DC = getFunctionLevelDeclContext();
// C++11 [expr.prim.general]p12:
// An id-expression that denotes a non-static data member or non-static
// member function of a class can only be used:
// (...)
// - if that id-expression denotes a non-static data member and it
// appears in an unevaluated operand.
//
// If this might be the case, form a DependentScopeDeclRefExpr instead of a
// CXXDependentScopeMemberExpr. The former can instantiate to either
// DeclRefExpr or MemberExpr depending on lookup results, while the latter is
// always a MemberExpr.
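  // For example (illustrative, not part of the original source):
  //   template <typename T> struct A : T {
  //     void f() {
  //       (void)sizeof(T::m);  // 'T::m' may denote a non-static data member;
  //     }                      // OK in C++11 within an unevaluated operand
  //   };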
bool MightBeCxx11UnevalField =
getLangOpts().CPlusPlus11 && isUnevaluatedContext();
// Check if the nested name specifier is an enum type.
bool IsEnum = false;
if (NestedNameSpecifier *NNS = SS.getScopeRep())
IsEnum = dyn_cast_or_null<EnumType>(NNS->getAsType());
if (!MightBeCxx11UnevalField && !isAddressOfOperand && !IsEnum &&
isa<CXXMethodDecl>(DC) && cast<CXXMethodDecl>(DC)->isInstance()) {
QualType ThisType = cast<CXXMethodDecl>(DC)->getThisType();
// Since the 'this' expression is synthesized, we don't need to
// perform the double-lookup check.
NamedDecl *FirstQualifierInScope = nullptr;
return CXXDependentScopeMemberExpr::Create(
Context, /*This*/ nullptr, ThisType, /*IsArrow*/ true,
/*Op*/ SourceLocation(), SS.getWithLocInContext(Context), TemplateKWLoc,
FirstQualifierInScope, NameInfo, TemplateArgs);
}
return BuildDependentDeclRefExpr(SS, TemplateKWLoc, NameInfo, TemplateArgs);
}
ExprResult
Sema::BuildDependentDeclRefExpr(const CXXScopeSpec &SS,
SourceLocation TemplateKWLoc,
const DeclarationNameInfo &NameInfo,
const TemplateArgumentListInfo *TemplateArgs) {
return DependentScopeDeclRefExpr::Create(
Context, SS.getWithLocInContext(Context), TemplateKWLoc, NameInfo,
TemplateArgs);
}
/// Determine whether we would be unable to instantiate this template (because
/// it either has no definition, or is in the process of being instantiated).
bool Sema::DiagnoseUninstantiableTemplate(SourceLocation PointOfInstantiation,
NamedDecl *Instantiation,
bool InstantiatedFromMember,
const NamedDecl *Pattern,
const NamedDecl *PatternDef,
TemplateSpecializationKind TSK,
bool Complain /*= true*/) {
assert(isa<TagDecl>(Instantiation) || isa<FunctionDecl>(Instantiation) ||
isa<VarDecl>(Instantiation));
bool IsEntityBeingDefined = false;
if (const TagDecl *TD = dyn_cast_or_null<TagDecl>(PatternDef))
IsEntityBeingDefined = TD->isBeingDefined();
if (PatternDef && !IsEntityBeingDefined) {
NamedDecl *SuggestedDef = nullptr;
if (!hasVisibleDefinition(const_cast<NamedDecl*>(PatternDef), &SuggestedDef,
/*OnlyNeedComplete*/false)) {
// If we're allowed to diagnose this and recover, do so.
bool Recover = Complain && !isSFINAEContext();
if (Complain)
diagnoseMissingImport(PointOfInstantiation, SuggestedDef,
Sema::MissingImportKind::Definition, Recover);
return !Recover;
}
return false;
}
if (!Complain || (PatternDef && PatternDef->isInvalidDecl()))
return true;
llvm::Optional<unsigned> Note;
QualType InstantiationTy;
if (TagDecl *TD = dyn_cast<TagDecl>(Instantiation))
InstantiationTy = Context.getTypeDeclType(TD);
if (PatternDef) {
Diag(PointOfInstantiation,
diag::err_template_instantiate_within_definition)
<< /*implicit|explicit*/(TSK != TSK_ImplicitInstantiation)
<< InstantiationTy;
// Not much point in noting the template declaration here, since
// we're lexically inside it.
Instantiation->setInvalidDecl();
} else if (InstantiatedFromMember) {
if (isa<FunctionDecl>(Instantiation)) {
Diag(PointOfInstantiation,
diag::err_explicit_instantiation_undefined_member)
<< /*member function*/ 1 << Instantiation->getDeclName()
<< Instantiation->getDeclContext();
Note = diag::note_explicit_instantiation_here;
} else {
assert(isa<TagDecl>(Instantiation) && "Must be a TagDecl!");
Diag(PointOfInstantiation,
diag::err_implicit_instantiate_member_undefined)
<< InstantiationTy;
Note = diag::note_member_declared_at;
}
} else {
if (isa<FunctionDecl>(Instantiation)) {
Diag(PointOfInstantiation,
diag::err_explicit_instantiation_undefined_func_template)
<< Pattern;
Note = diag::note_explicit_instantiation_here;
} else if (isa<TagDecl>(Instantiation)) {
Diag(PointOfInstantiation, diag::err_template_instantiate_undefined)
<< (TSK != TSK_ImplicitInstantiation)
<< InstantiationTy;
Note = diag::note_template_decl_here;
} else {
assert(isa<VarDecl>(Instantiation) && "Must be a VarDecl!");
if (isa<VarTemplateSpecializationDecl>(Instantiation)) {
Diag(PointOfInstantiation,
diag::err_explicit_instantiation_undefined_var_template)
<< Instantiation;
Instantiation->setInvalidDecl();
} else
Diag(PointOfInstantiation,
diag::err_explicit_instantiation_undefined_member)
<< /*static data member*/ 2 << Instantiation->getDeclName()
<< Instantiation->getDeclContext();
Note = diag::note_explicit_instantiation_here;
}
}
if (Note) // Diagnostics were emitted.
Diag(Pattern->getLocation(), Note.getValue());
  // In general, Instantiation isn't marked invalid, so that more than one
  // error can be emitted for multiple undefined instantiations. But the code
  // that does explicit declaration -> explicit definition conversion can't
  // handle invalid declarations, so mark it invalid in that case.
if (TSK == TSK_ExplicitInstantiationDeclaration)
Instantiation->setInvalidDecl();
return true;
}
/// DiagnoseTemplateParameterShadow - Produce a diagnostic complaining
/// that the template parameter 'PrevDecl' is being shadowed by a new
/// declaration at location Loc.
void Sema::DiagnoseTemplateParameterShadow(SourceLocation Loc, Decl *PrevDecl) {
assert(PrevDecl->isTemplateParameter() && "Not a template parameter");
// Microsoft Visual C++ permits template parameters to be shadowed.
if (getLangOpts().MicrosoftExt)
return;
// C++ [temp.local]p4:
// A template-parameter shall not be redeclared within its
// scope (including nested scopes).
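  // Illustrative example (not part of the original source):
  //   template <typename T> void f(int T);  // error: declaration of 'T' shadows
  //                                         // the template parameter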
Diag(Loc, diag::err_template_param_shadow)
<< cast<NamedDecl>(PrevDecl)->getDeclName();
Diag(PrevDecl->getLocation(), diag::note_template_param_here);
}
/// AdjustDeclIfTemplate - If the given decl happens to be a template, reset
/// the parameter D to reference the templated declaration and return a pointer
/// to the template declaration. Otherwise, do nothing to D and return null.
TemplateDecl *Sema::AdjustDeclIfTemplate(Decl *&D) {
if (TemplateDecl *Temp = dyn_cast_or_null<TemplateDecl>(D)) {
D = Temp->getTemplatedDecl();
return Temp;
}
return nullptr;
}
ParsedTemplateArgument ParsedTemplateArgument::getTemplatePackExpansion(
SourceLocation EllipsisLoc) const {
assert(Kind == Template &&
"Only template template arguments can be pack expansions here");
assert(getAsTemplate().get().containsUnexpandedParameterPack() &&
"Template template argument pack expansion without packs");
ParsedTemplateArgument Result(*this);
Result.EllipsisLoc = EllipsisLoc;
return Result;
}
static TemplateArgumentLoc translateTemplateArgument(Sema &SemaRef,
const ParsedTemplateArgument &Arg) {
switch (Arg.getKind()) {
case ParsedTemplateArgument::Type: {
TypeSourceInfo *DI;
QualType T = SemaRef.GetTypeFromParser(Arg.getAsType(), &DI);
if (!DI)
DI = SemaRef.Context.getTrivialTypeSourceInfo(T, Arg.getLocation());
return TemplateArgumentLoc(TemplateArgument(T), DI);
}
case ParsedTemplateArgument::NonType: {
Expr *E = static_cast<Expr *>(Arg.getAsExpr());
return TemplateArgumentLoc(TemplateArgument(E), E);
}
case ParsedTemplateArgument::Template: {
TemplateName Template = Arg.getAsTemplate().get();
TemplateArgument TArg;
if (Arg.getEllipsisLoc().isValid())
TArg = TemplateArgument(Template, Optional<unsigned int>());
else
TArg = Template;
return TemplateArgumentLoc(TArg,
Arg.getScopeSpec().getWithLocInContext(
SemaRef.Context),
Arg.getLocation(),
Arg.getEllipsisLoc());
}
}
llvm_unreachable("Unhandled parsed template argument");
}
/// Translates template arguments as provided by the parser
/// into template arguments used by semantic analysis.
void Sema::translateTemplateArguments(const ASTTemplateArgsPtr &TemplateArgsIn,
TemplateArgumentListInfo &TemplateArgs) {
for (unsigned I = 0, Last = TemplateArgsIn.size(); I != Last; ++I)
TemplateArgs.addArgument(translateTemplateArgument(*this,
TemplateArgsIn[I]));
}
static void maybeDiagnoseTemplateParameterShadow(Sema &SemaRef, Scope *S,
SourceLocation Loc,
IdentifierInfo *Name) {
NamedDecl *PrevDecl = SemaRef.LookupSingleName(
S, Name, Loc, Sema::LookupOrdinaryName, Sema::ForVisibleRedeclaration);
if (PrevDecl && PrevDecl->isTemplateParameter())
SemaRef.DiagnoseTemplateParameterShadow(Loc, PrevDecl);
}
/// Convert a parsed type into a parsed template argument. This is mostly
/// trivial, except that we may have parsed a C++17 deduced class template
/// specialization type, in which case we should form a template template
/// argument instead of a type template argument.
ParsedTemplateArgument Sema::ActOnTemplateTypeArgument(TypeResult ParsedType) {
TypeSourceInfo *TInfo;
QualType T = GetTypeFromParser(ParsedType.get(), &TInfo);
if (T.isNull())
return ParsedTemplateArgument();
assert(TInfo && "template argument with no location");
// If we might have formed a deduced template specialization type, convert
// it to a template template argument.
if (getLangOpts().CPlusPlus17) {
TypeLoc TL = TInfo->getTypeLoc();
SourceLocation EllipsisLoc;
if (auto PET = TL.getAs<PackExpansionTypeLoc>()) {
EllipsisLoc = PET.getEllipsisLoc();
TL = PET.getPatternLoc();
}
CXXScopeSpec SS;
if (auto ET = TL.getAs<ElaboratedTypeLoc>()) {
SS.Adopt(ET.getQualifierLoc());
TL = ET.getNamedTypeLoc();
}
if (auto DTST = TL.getAs<DeducedTemplateSpecializationTypeLoc>()) {
TemplateName Name = DTST.getTypePtr()->getTemplateName();
if (SS.isSet())
Name = Context.getQualifiedTemplateName(SS.getScopeRep(),
/*HasTemplateKeyword*/ false,
Name.getAsTemplateDecl());
ParsedTemplateArgument Result(SS, TemplateTy::make(Name),
DTST.getTemplateNameLoc());
if (EllipsisLoc.isValid())
Result = Result.getTemplatePackExpansion(EllipsisLoc);
return Result;
}
}
// This is a normal type template argument. Note, if the type template
// argument is an injected-class-name for a template, it has a dual nature
// and can be used as either a type or a template. We handle that in
// convertTypeTemplateArgumentToTemplate.
return ParsedTemplateArgument(ParsedTemplateArgument::Type,
ParsedType.get().getAsOpaquePtr(),
TInfo->getTypeLoc().getBeginLoc());
}
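// A minimal C++17 sketch of the conversion performed above (hypothetical
// names): the class template name 'Box' is parsed as a deduced template
// specialization type and is turned back into a template template argument
// here.
//
//   template <typename T> struct Box { Box(T); };
//   template <template <typename> class TT> struct User {};
//   User<Box> U; // 'Box' arrives from the parser as a type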
/// ActOnTypeParameter - Called when a C++ template type parameter
/// (e.g., "typename T") has been parsed. Typename specifies whether
/// the keyword "typename" was used to declare the type parameter
/// (otherwise, "class" was used), and KeyLoc is the location of the
/// "class" or "typename" keyword. ParamName is the name of the
/// parameter (NULL indicates an unnamed template parameter) and
/// ParamNameLoc is the location of the parameter name (if any).
/// If the type parameter has a default argument, it will be added
/// later via ActOnTypeParameterDefault.
NamedDecl *Sema::ActOnTypeParameter(Scope *S, bool Typename,
SourceLocation EllipsisLoc,
SourceLocation KeyLoc,
IdentifierInfo *ParamName,
SourceLocation ParamNameLoc,
unsigned Depth, unsigned Position,
SourceLocation EqualLoc,
ParsedType DefaultArg) {
assert(S->isTemplateParamScope() &&
"Template type parameter not in template parameter scope!");
SourceLocation Loc = ParamNameLoc;
if (!ParamName)
Loc = KeyLoc;
bool IsParameterPack = EllipsisLoc.isValid();
TemplateTypeParmDecl *Param
= TemplateTypeParmDecl::Create(Context, Context.getTranslationUnitDecl(),
KeyLoc, Loc, Depth, Position, ParamName,
Typename, IsParameterPack);
Param->setAccess(AS_public);
if (ParamName) {
maybeDiagnoseTemplateParameterShadow(*this, S, ParamNameLoc, ParamName);
// Add the template parameter into the current scope.
S->AddDecl(Param);
IdResolver.AddDecl(Param);
}
// C++0x [temp.param]p9:
// A default template-argument may be specified for any kind of
// template-parameter that is not a template parameter pack.
if (DefaultArg && IsParameterPack) {
Diag(EqualLoc, diag::err_template_param_pack_default_arg);
DefaultArg = nullptr;
}
// Handle the default argument, if provided.
if (DefaultArg) {
TypeSourceInfo *DefaultTInfo;
GetTypeFromParser(DefaultArg, &DefaultTInfo);
assert(DefaultTInfo && "expected source information for type");
// Check for unexpanded parameter packs.
if (DiagnoseUnexpandedParameterPack(Loc, DefaultTInfo,
UPPC_DefaultArgument))
return Param;
// Check the template argument itself.
if (CheckTemplateArgument(Param, DefaultTInfo)) {
Param->setInvalidDecl();
return Param;
}
Param->setDefaultArgument(DefaultTInfo);
}
return Param;
}
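// Hypothetical examples of the default-argument handling above:
//
//   template <typename T = int> struct HasDefault {}; // default recorded on the parameter
//   template <typename... Ts = int> struct Bad {};    // diagnosed: a template parameter
//                                                     // pack cannot have a default argument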
/// Check that the type of a non-type template parameter is
/// well-formed.
///
/// \returns the (possibly-promoted) parameter type if valid;
/// otherwise, produces a diagnostic and returns a NULL type.
QualType Sema::CheckNonTypeTemplateParameterType(TypeSourceInfo *&TSI,
SourceLocation Loc) {
if (TSI->getType()->isUndeducedType()) {
// C++17 [temp.dep.expr]p3:
// An id-expression is type-dependent if it contains
// - an identifier associated by name lookup with a non-type
// template-parameter declared with a type that contains a
// placeholder type (7.1.7.4),
TSI = SubstAutoTypeSourceInfo(TSI, Context.DependentTy);
}
return CheckNonTypeTemplateParameterType(TSI->getType(), Loc);
}
QualType Sema::CheckNonTypeTemplateParameterType(QualType T,
SourceLocation Loc) {
// We don't allow variably-modified types as the type of non-type template
// parameters.
if (T->isVariablyModifiedType()) {
Diag(Loc, diag::err_variably_modified_nontype_template_param)
<< T;
return QualType();
}
// C++ [temp.param]p4:
//
// A non-type template-parameter shall have one of the following
// (optionally cv-qualified) types:
//
// -- integral or enumeration type,
if (T->isIntegralOrEnumerationType() ||
// -- pointer to object or pointer to function,
T->isPointerType() ||
// -- reference to object or reference to function,
T->isReferenceType() ||
// -- pointer to member,
T->isMemberPointerType() ||
// -- std::nullptr_t.
T->isNullPtrType() ||
// If T is a dependent type, we can't do the check now, so we
// assume that it is well-formed.
T->isDependentType() ||
// Allow use of auto in template parameter declarations.
T->isUndeducedType()) {
// C++ [temp.param]p5: The top-level cv-qualifiers on the template-parameter
// are ignored when determining its type.
return T.getUnqualifiedType();
}
// C++ [temp.param]p8:
//
// A non-type template-parameter of type "array of T" or
// "function returning T" is adjusted to be of type "pointer to
// T" or "pointer to function returning T", respectively.
else if (T->isArrayType() || T->isFunctionType())
return Context.getDecayedType(T);
Diag(Loc, diag::err_template_nontype_parm_bad_type)
<< T;
return QualType();
}
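// Illustrative (hypothetical) parameter types under the rules implemented
// above, which reflect the pre-C++20 restrictions on non-type template
// parameters:
//
//   template <int N> struct A {};        // integral: accepted
//   template <int *P> struct B {};       // pointer: accepted
//   template <int (&R)[4]> struct C {};  // reference to object: accepted
//   template <int Arr[4]> struct D {};   // array: adjusted to 'int *'
//   template <float F> struct E {};      // rejected by this check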
NamedDecl *Sema::ActOnNonTypeTemplateParameter(Scope *S, Declarator &D,
unsigned Depth,
unsigned Position,
SourceLocation EqualLoc,
Expr *Default) {
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
// Check that we have valid decl-specifiers specified.
auto CheckValidDeclSpecifiers = [this, &D] {
// C++ [temp.param]
// p1
// template-parameter:
// ...
// parameter-declaration
// p2
// ... A storage class shall not be specified in a template-parameter
// declaration.
// [dcl.typedef]p1:
// The typedef specifier [...] shall not be used in the decl-specifier-seq
// of a parameter-declaration
const DeclSpec &DS = D.getDeclSpec();
auto EmitDiag = [this](SourceLocation Loc) {
Diag(Loc, diag::err_invalid_decl_specifier_in_nontype_parm)
<< FixItHint::CreateRemoval(Loc);
};
if (DS.getStorageClassSpec() != DeclSpec::SCS_unspecified)
EmitDiag(DS.getStorageClassSpecLoc());
if (DS.getThreadStorageClassSpec() != TSCS_unspecified)
EmitDiag(DS.getThreadStorageClassSpecLoc());
// [dcl.inline]p1:
// The inline specifier can be applied only to the declaration or
// definition of a variable or function.
if (DS.isInlineSpecified())
EmitDiag(DS.getInlineSpecLoc());
// [dcl.constexpr]p1:
// The constexpr specifier shall be applied only to the definition of a
// variable or variable template or the declaration of a function or
// function template.
if (DS.hasConstexprSpecifier())
EmitDiag(DS.getConstexprSpecLoc());
// [dcl.fct.spec]p1:
// Function-specifiers can be used only in function declarations.
if (DS.isVirtualSpecified())
EmitDiag(DS.getVirtualSpecLoc());
if (DS.hasExplicitSpecifier())
EmitDiag(DS.getExplicitSpecLoc());
if (DS.isNoreturnSpecified())
EmitDiag(DS.getNoreturnSpecLoc());
};
CheckValidDeclSpecifiers();
if (TInfo->getType()->isUndeducedType()) {
Diag(D.getIdentifierLoc(),
diag::warn_cxx14_compat_template_nontype_parm_auto_type)
<< QualType(TInfo->getType()->getContainedAutoType(), 0);
}
assert(S->isTemplateParamScope() &&
"Non-type template parameter not in template parameter scope!");
bool Invalid = false;
QualType T = CheckNonTypeTemplateParameterType(TInfo, D.getIdentifierLoc());
if (T.isNull()) {
T = Context.IntTy; // Recover with an 'int' type.
Invalid = true;
}
CheckFunctionOrTemplateParamDeclarator(S, D);
IdentifierInfo *ParamName = D.getIdentifier();
bool IsParameterPack = D.hasEllipsis();
NonTypeTemplateParmDecl *Param = NonTypeTemplateParmDecl::Create(
Context, Context.getTranslationUnitDecl(), D.getBeginLoc(),
D.getIdentifierLoc(), Depth, Position, ParamName, T, IsParameterPack,
TInfo);
Param->setAccess(AS_public);
if (Invalid)
Param->setInvalidDecl();
if (ParamName) {
maybeDiagnoseTemplateParameterShadow(*this, S, D.getIdentifierLoc(),
ParamName);
// Add the template parameter into the current scope.
S->AddDecl(Param);
IdResolver.AddDecl(Param);
}
// C++0x [temp.param]p9:
// A default template-argument may be specified for any kind of
// template-parameter that is not a template parameter pack.
if (Default && IsParameterPack) {
Diag(EqualLoc, diag::err_template_param_pack_default_arg);
Default = nullptr;
}
// Check the well-formedness of the default template argument, if provided.
if (Default) {
// Check for unexpanded parameter packs.
if (DiagnoseUnexpandedParameterPack(Default, UPPC_DefaultArgument))
return Param;
TemplateArgument Converted;
ExprResult DefaultRes =
CheckTemplateArgument(Param, Param->getType(), Default, Converted);
if (DefaultRes.isInvalid()) {
Param->setInvalidDecl();
return Param;
}
Default = DefaultRes.get();
Param->setDefaultArgument(Default);
}
return Param;
}
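// Hypothetical examples of the declarator checks above:
//
//   template <static int N> struct A {}; // diagnosed: storage class specifiers are not
//                                        // allowed here (the fix-it removes 'static')
//   template <auto V> struct B {};       // C++17 'auto' placeholder; flagged only under
//                                        // the C++14-compatibility warnings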
/// ActOnTemplateTemplateParameter - Called when a C++ template template
/// parameter (e.g. T in template <template \<typename> class T> class array)
/// has been parsed. S is the current scope.
NamedDecl *Sema::ActOnTemplateTemplateParameter(Scope* S,
SourceLocation TmpLoc,
TemplateParameterList *Params,
SourceLocation EllipsisLoc,
IdentifierInfo *Name,
SourceLocation NameLoc,
unsigned Depth,
unsigned Position,
SourceLocation EqualLoc,
ParsedTemplateArgument Default) {
assert(S->isTemplateParamScope() &&
"Template template parameter not in template parameter scope!");
// Construct the parameter object.
bool IsParameterPack = EllipsisLoc.isValid();
TemplateTemplateParmDecl *Param =
TemplateTemplateParmDecl::Create(Context, Context.getTranslationUnitDecl(),
NameLoc.isInvalid()? TmpLoc : NameLoc,
Depth, Position, IsParameterPack,
Name, Params);
Param->setAccess(AS_public);
// If the template template parameter has a name, then link the identifier
// into the scope and lookup mechanisms.
if (Name) {
maybeDiagnoseTemplateParameterShadow(*this, S, NameLoc, Name);
S->AddDecl(Param);
IdResolver.AddDecl(Param);
}
if (Params->size() == 0) {
Diag(Param->getLocation(), diag::err_template_template_parm_no_parms)
<< SourceRange(Params->getLAngleLoc(), Params->getRAngleLoc());
Param->setInvalidDecl();
}
// C++0x [temp.param]p9:
// A default template-argument may be specified for any kind of
// template-parameter that is not a template parameter pack.
if (IsParameterPack && !Default.isInvalid()) {
Diag(EqualLoc, diag::err_template_param_pack_default_arg);
Default = ParsedTemplateArgument();
}
if (!Default.isInvalid()) {
// Check only that we have a template template argument. We don't want to
// try to check well-formedness now, because our template template parameter
// might have dependent types in its template parameters, which we wouldn't
// be able to match now.
//
// If none of the template template parameter's template arguments mention
// other template parameters, we could actually perform more checking here.
// However, it isn't worth doing.
TemplateArgumentLoc DefaultArg = translateTemplateArgument(*this, Default);
if (DefaultArg.getArgument().getAsTemplate().isNull()) {
Diag(DefaultArg.getLocation(), diag::err_template_arg_not_valid_template)
<< DefaultArg.getSourceRange();
return Param;
}
// Check for unexpanded parameter packs.
if (DiagnoseUnexpandedParameterPack(DefaultArg.getLocation(),
DefaultArg.getArgument().getAsTemplate(),
UPPC_DefaultArgument))
return Param;
Param->setDefaultArgument(Context, DefaultArg);
}
return Param;
}
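// Hypothetical examples for the checks above:
//
//   template <typename T> struct Holder {};
//   template <template <typename> class C = Holder> struct HasDefault {}; // default recorded
//   template <template <> class C> struct NoParms {}; // diagnosed: the template template
//                                                     // parameter must declare its own
//                                                     // template parameters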
/// ActOnTemplateParameterList - Builds a TemplateParameterList, optionally
/// constrained by RequiresClause, that contains the template parameters in
/// Params.
TemplateParameterList *
Sema::ActOnTemplateParameterList(unsigned Depth,
SourceLocation ExportLoc,
SourceLocation TemplateLoc,
SourceLocation LAngleLoc,
ArrayRef<NamedDecl *> Params,
SourceLocation RAngleLoc,
Expr *RequiresClause) {
if (ExportLoc.isValid())
Diag(ExportLoc, diag::warn_template_export_unsupported);
return TemplateParameterList::Create(
Context, TemplateLoc, LAngleLoc,
llvm::makeArrayRef(Params.data(), Params.size()),
RAngleLoc, RequiresClause);
}
static void SetNestedNameSpecifier(Sema &S, TagDecl *T,
const CXXScopeSpec &SS) {
if (SS.isSet())
T->setQualifierInfo(SS.getWithLocInContext(S.Context));
}
DeclResult Sema::CheckClassTemplate(
Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
CXXScopeSpec &SS, IdentifierInfo *Name, SourceLocation NameLoc,
const ParsedAttributesView &Attr, TemplateParameterList *TemplateParams,
AccessSpecifier AS, SourceLocation ModulePrivateLoc,
SourceLocation FriendLoc, unsigned NumOuterTemplateParamLists,
TemplateParameterList **OuterTemplateParamLists, SkipBodyInfo *SkipBody) {
assert(TemplateParams && TemplateParams->size() > 0 &&
"No template parameters");
assert(TUK != TUK_Reference && "Can only declare or define class templates");
bool Invalid = false;
// Check that we can declare a template here.
if (CheckTemplateDeclScope(S, TemplateParams))
return true;
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
assert(Kind != TTK_Enum && "can't build template of enumerated type");
// There is no such thing as an unnamed class template.
if (!Name) {
Diag(KWLoc, diag::err_template_unnamed_class);
return true;
}
// Find any previous declaration with this name. For a friend with no
// scope explicitly specified, we only look for tag declarations (per
// C++11 [basic.lookup.elab]p2).
DeclContext *SemanticContext;
LookupResult Previous(*this, Name, NameLoc,
(SS.isEmpty() && TUK == TUK_Friend)
? LookupTagName : LookupOrdinaryName,
forRedeclarationInCurContext());
if (SS.isNotEmpty() && !SS.isInvalid()) {
SemanticContext = computeDeclContext(SS, true);
if (!SemanticContext) {
// FIXME: Horrible, horrible hack! We can't currently represent this
// in the AST, and historically we have just ignored such friend
// class templates, so don't complain here.
Diag(NameLoc, TUK == TUK_Friend
? diag::warn_template_qualified_friend_ignored
: diag::err_template_qualified_declarator_no_match)
<< SS.getScopeRep() << SS.getRange();
return TUK != TUK_Friend;
}
if (RequireCompleteDeclContext(SS, SemanticContext))
return true;
// If we're adding a template to a dependent context, we may need to
// rebuild some of the types used within the template parameter list,
// now that we know what the current instantiation is.
if (SemanticContext->isDependentContext()) {
ContextRAII SavedContext(*this, SemanticContext);
if (RebuildTemplateParamsInCurrentInstantiation(TemplateParams))
Invalid = true;
} else if (TUK != TUK_Friend && TUK != TUK_Reference)
diagnoseQualifiedDeclaration(SS, SemanticContext, Name, NameLoc, false);
LookupQualifiedName(Previous, SemanticContext);
} else {
SemanticContext = CurContext;
// C++14 [class.mem]p14:
// If T is the name of a class, then each of the following shall have a
// name different from T:
// -- every member template of class T
if (TUK != TUK_Friend &&
DiagnoseClassNameShadow(SemanticContext,
DeclarationNameInfo(Name, NameLoc)))
return true;
LookupName(Previous, S);
}
if (Previous.isAmbiguous())
return true;
NamedDecl *PrevDecl = nullptr;
if (Previous.begin() != Previous.end())
PrevDecl = (*Previous.begin())->getUnderlyingDecl();
if (PrevDecl && PrevDecl->isTemplateParameter()) {
// Maybe we will complain about the shadowed template parameter.
DiagnoseTemplateParameterShadow(NameLoc, PrevDecl);
// Just pretend that we didn't see the previous declaration.
PrevDecl = nullptr;
}
// If there is a previous declaration with the same name, check
// whether this is a valid redeclaration.
ClassTemplateDecl *PrevClassTemplate =
dyn_cast_or_null<ClassTemplateDecl>(PrevDecl);
// We may have found the injected-class-name of a class template,
// class template partial specialization, or class template specialization.
// In these cases, grab the template that is being defined or specialized.
if (!PrevClassTemplate && PrevDecl && isa<CXXRecordDecl>(PrevDecl) &&
cast<CXXRecordDecl>(PrevDecl)->isInjectedClassName()) {
PrevDecl = cast<CXXRecordDecl>(PrevDecl->getDeclContext());
PrevClassTemplate
= cast<CXXRecordDecl>(PrevDecl)->getDescribedClassTemplate();
if (!PrevClassTemplate && isa<ClassTemplateSpecializationDecl>(PrevDecl)) {
PrevClassTemplate
= cast<ClassTemplateSpecializationDecl>(PrevDecl)
->getSpecializedTemplate();
}
}
if (TUK == TUK_Friend) {
// C++ [namespace.memdef]p3:
// [...] When looking for a prior declaration of a class or a function
// declared as a friend, and when the name of the friend class or
// function is neither a qualified name nor a template-id, scopes outside
// the innermost enclosing namespace scope are not considered.
if (!SS.isSet()) {
DeclContext *OutermostContext = CurContext;
while (!OutermostContext->isFileContext())
OutermostContext = OutermostContext->getLookupParent();
if (PrevDecl &&
(OutermostContext->Equals(PrevDecl->getDeclContext()) ||
OutermostContext->Encloses(PrevDecl->getDeclContext()))) {
SemanticContext = PrevDecl->getDeclContext();
} else {
// Declarations in outer scopes don't matter. However, the outermost
// context we computed is the semantic context for our new
// declaration.
PrevDecl = PrevClassTemplate = nullptr;
SemanticContext = OutermostContext;
// Check that the chosen semantic context doesn't already contain a
// declaration of this name as a non-tag type.
Previous.clear(LookupOrdinaryName);
DeclContext *LookupContext = SemanticContext;
while (LookupContext->isTransparentContext())
LookupContext = LookupContext->getLookupParent();
LookupQualifiedName(Previous, LookupContext);
if (Previous.isAmbiguous())
return true;
if (Previous.begin() != Previous.end())
PrevDecl = (*Previous.begin())->getUnderlyingDecl();
}
}
} else if (PrevDecl &&
!isDeclInScope(Previous.getRepresentativeDecl(), SemanticContext,
S, SS.isValid()))
PrevDecl = PrevClassTemplate = nullptr;
if (auto *Shadow = dyn_cast_or_null<UsingShadowDecl>(
PrevDecl ? Previous.getRepresentativeDecl() : nullptr)) {
if (SS.isEmpty() &&
!(PrevClassTemplate &&
PrevClassTemplate->getDeclContext()->getRedeclContext()->Equals(
SemanticContext->getRedeclContext()))) {
Diag(KWLoc, diag::err_using_decl_conflict_reverse);
Diag(Shadow->getTargetDecl()->getLocation(),
diag::note_using_decl_target);
Diag(Shadow->getUsingDecl()->getLocation(), diag::note_using_decl) << 0;
// Recover by ignoring the old declaration.
PrevDecl = PrevClassTemplate = nullptr;
}
}
// TODO Memory management; associated constraints are not always stored.
Expr *const CurAC = formAssociatedConstraints(TemplateParams, nullptr);
if (PrevClassTemplate) {
// Ensure that the template parameter lists are compatible. Skip this check
// for a friend in a dependent context: the template parameter list itself
// could be dependent.
if (!(TUK == TUK_Friend && CurContext->isDependentContext()) &&
!TemplateParameterListsAreEqual(TemplateParams,
PrevClassTemplate->getTemplateParameters(),
/*Complain=*/true,
TPL_TemplateMatch))
return true;
// Check for matching associated constraints on redeclarations.
const Expr *const PrevAC = PrevClassTemplate->getAssociatedConstraints();
const bool RedeclACMismatch = [&] {
if (!(CurAC || PrevAC))
return false; // Nothing to check; no mismatch.
if (CurAC && PrevAC) {
llvm::FoldingSetNodeID CurACInfo, PrevACInfo;
CurAC->Profile(CurACInfo, Context, /*Canonical=*/true);
PrevAC->Profile(PrevACInfo, Context, /*Canonical=*/true);
if (CurACInfo == PrevACInfo)
return false; // All good; no mismatch.
}
return true;
}();
if (RedeclACMismatch) {
Diag(CurAC ? CurAC->getBeginLoc() : NameLoc,
diag::err_template_different_associated_constraints);
Diag(PrevAC ? PrevAC->getBeginLoc() : PrevClassTemplate->getLocation(),
diag::note_template_prev_declaration)
<< /*declaration*/ 0;
return true;
}
// C++ [temp.class]p4:
// In a redeclaration, partial specialization, explicit
// specialization or explicit instantiation of a class template,
// the class-key shall agree in kind with the original class
// template declaration (7.1.5.3).
RecordDecl *PrevRecordDecl = PrevClassTemplate->getTemplatedDecl();
if (!isAcceptableTagRedeclaration(PrevRecordDecl, Kind,
TUK == TUK_Definition, KWLoc, Name)) {
Diag(KWLoc, diag::err_use_with_wrong_tag)
<< Name
<< FixItHint::CreateReplacement(KWLoc, PrevRecordDecl->getKindName());
Diag(PrevRecordDecl->getLocation(), diag::note_previous_use);
Kind = PrevRecordDecl->getTagKind();
}
// Check for redefinition of this class template.
if (TUK == TUK_Definition) {
if (TagDecl *Def = PrevRecordDecl->getDefinition()) {
// If we have a prior definition that is not visible, treat this as
// simply making that previous definition visible.
NamedDecl *Hidden = nullptr;
if (SkipBody && !hasVisibleDefinition(Def, &Hidden)) {
SkipBody->ShouldSkip = true;
SkipBody->Previous = Def;
auto *Tmpl = cast<CXXRecordDecl>(Hidden)->getDescribedClassTemplate();
assert(Tmpl && "original definition of a class template is not a "
"class template?");
makeMergedDefinitionVisible(Hidden);
makeMergedDefinitionVisible(Tmpl);
} else {
Diag(NameLoc, diag::err_redefinition) << Name;
Diag(Def->getLocation(), diag::note_previous_definition);
// FIXME: Would it make sense to try to "forget" the previous
// definition, as part of error recovery?
return true;
}
}
}
} else if (PrevDecl) {
// C++ [temp]p5:
// A class template shall not have the same name as any other
// template, class, function, object, enumeration, enumerator,
// namespace, or type in the same scope (3.3), except as specified
// in (14.5.4).
Diag(NameLoc, diag::err_redefinition_different_kind) << Name;
Diag(PrevDecl->getLocation(), diag::note_previous_definition);
return true;
}
// Check the template parameter list of this declaration, possibly
// merging in the template parameter list from the previous class
// template declaration. Skip this check for a friend in a dependent
// context, because the template parameter list might be dependent.
if (!(TUK == TUK_Friend && CurContext->isDependentContext()) &&
CheckTemplateParameterList(
TemplateParams,
PrevClassTemplate
? PrevClassTemplate->getMostRecentDecl()->getTemplateParameters()
: nullptr,
(SS.isSet() && SemanticContext && SemanticContext->isRecord() &&
SemanticContext->isDependentContext())
? TPC_ClassTemplateMember
: TUK == TUK_Friend ? TPC_FriendClassTemplate : TPC_ClassTemplate,
SkipBody))
Invalid = true;
if (SS.isSet()) {
// If the name of the template was qualified, we must be defining the
// template out-of-line.
if (!SS.isInvalid() && !Invalid && !PrevClassTemplate) {
Diag(NameLoc, TUK == TUK_Friend ? diag::err_friend_decl_does_not_match
: diag::err_member_decl_does_not_match)
<< Name << SemanticContext << /*IsDefinition*/true << SS.getRange();
Invalid = true;
}
}
// If this is a templated friend in a dependent context, we should not put it
// on the redeclaration chain. In some cases, the templated friend can be the
// most recent declaration, tricking the template instantiator into making
// substitutions there.
// FIXME: Figure out how to combine with shouldLinkDependentDeclWithPrevious
bool ShouldAddRedecl
= !(TUK == TUK_Friend && CurContext->isDependentContext());
CXXRecordDecl *NewClass =
CXXRecordDecl::Create(Context, Kind, SemanticContext, KWLoc, NameLoc, Name,
PrevClassTemplate && ShouldAddRedecl ?
PrevClassTemplate->getTemplatedDecl() : nullptr,
/*DelayTypeCreation=*/true);
SetNestedNameSpecifier(*this, NewClass, SS);
if (NumOuterTemplateParamLists > 0)
NewClass->setTemplateParameterListsInfo(
Context, llvm::makeArrayRef(OuterTemplateParamLists,
NumOuterTemplateParamLists));
// Add alignment attributes if necessary; these attributes are checked when
// the ASTContext lays out the structure.
if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(NewClass);
AddMsStructLayoutForRecord(NewClass);
}
// Attach the associated constraints when the declaration will not be part of
// a decl chain.
Expr *const ACtoAttach =
PrevClassTemplate && ShouldAddRedecl ? nullptr : CurAC;
ClassTemplateDecl *NewTemplate
= ClassTemplateDecl::Create(Context, SemanticContext, NameLoc,
DeclarationName(Name), TemplateParams,
NewClass, ACtoAttach);
if (ShouldAddRedecl)
NewTemplate->setPreviousDecl(PrevClassTemplate);
NewClass->setDescribedClassTemplate(NewTemplate);
if (ModulePrivateLoc.isValid())
NewTemplate->setModulePrivate();
// Build the type for the class template declaration now.
QualType T = NewTemplate->getInjectedClassNameSpecialization();
T = Context.getInjectedClassNameType(NewClass, T);
assert(T->isDependentType() && "Class template type is not dependent?");
(void)T;
// If we are providing an explicit specialization of a member that is a
// class template, make a note of that.
if (PrevClassTemplate &&
PrevClassTemplate->getInstantiatedFromMemberTemplate())
PrevClassTemplate->setMemberSpecialization();
// Set the access specifier.
if (!Invalid && TUK != TUK_Friend && NewTemplate->getDeclContext()->isRecord())
SetMemberAccessSpecifier(NewTemplate, PrevClassTemplate, AS);
// Set the lexical context of these templates
NewClass->setLexicalDeclContext(CurContext);
NewTemplate->setLexicalDeclContext(CurContext);
if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip))
NewClass->startDefinition();
ProcessDeclAttributeList(S, NewClass, Attr);
if (PrevClassTemplate)
mergeDeclAttributes(NewClass, PrevClassTemplate->getTemplatedDecl());
AddPushedVisibilityAttribute(NewClass);
if (TUK != TUK_Friend) {
// Per C++ [basic.scope.temp]p2, skip the template parameter scopes.
Scope *Outer = S;
while ((Outer->getFlags() & Scope::TemplateParamScope) != 0)
Outer = Outer->getParent();
PushOnScopeChains(NewTemplate, Outer);
} else {
if (PrevClassTemplate && PrevClassTemplate->getAccess() != AS_none) {
NewTemplate->setAccess(PrevClassTemplate->getAccess());
NewClass->setAccess(PrevClassTemplate->getAccess());
}
NewTemplate->setObjectOfFriendDecl();
// Friend templates are visible in fairly strange ways.
if (!CurContext->isDependentContext()) {
DeclContext *DC = SemanticContext->getRedeclContext();
DC->makeDeclVisibleInContext(NewTemplate);
if (Scope *EnclosingScope = getScopeForDeclContext(S, DC))
PushOnScopeChains(NewTemplate, EnclosingScope,
/* AddToContext = */ false);
}
FriendDecl *Friend = FriendDecl::Create(
Context, CurContext, NewClass->getLocation(), NewTemplate, FriendLoc);
Friend->setAccess(AS_public);
CurContext->addDecl(Friend);
}
if (PrevClassTemplate)
CheckRedeclarationModuleOwnership(NewTemplate, PrevClassTemplate);
if (Invalid) {
NewTemplate->setInvalidDecl();
NewClass->setInvalidDecl();
}
ActOnDocumentableDecl(NewTemplate);
if (SkipBody && SkipBody->ShouldSkip)
return SkipBody->Previous;
return NewTemplate;
}
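// Two hypothetical situations handled by the checks above:
//
//   int Widget = 0;
//   template <typename T> class Widget {}; // diagnosed: redeclaration as a different
//                                          // kind of symbol
//
//   template <typename T> class List;      // prior declaration found by lookup; the
//   template <typename T> class List {};   // definition is checked against it (parameter
//                                          // lists, tag kind, associated constraints)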
namespace {
/// Tree transform to "extract" a transformed type from a class template's
/// constructor to a deduction guide.
class ExtractTypeForDeductionGuide
: public TreeTransform<ExtractTypeForDeductionGuide> {
public:
typedef TreeTransform<ExtractTypeForDeductionGuide> Base;
ExtractTypeForDeductionGuide(Sema &SemaRef) : Base(SemaRef) {}
TypeSourceInfo *transform(TypeSourceInfo *TSI) { return TransformType(TSI); }
QualType TransformTypedefType(TypeLocBuilder &TLB, TypedefTypeLoc TL) {
return TransformType(
TLB,
TL.getTypedefNameDecl()->getTypeSourceInfo()->getTypeLoc());
}
};
/// Transform to convert portions of a constructor declaration into the
/// corresponding deduction guide, per C++1z [over.match.class.deduct]p1.
struct ConvertConstructorToDeductionGuideTransform {
ConvertConstructorToDeductionGuideTransform(Sema &S,
ClassTemplateDecl *Template)
: SemaRef(S), Template(Template) {}
Sema &SemaRef;
ClassTemplateDecl *Template;
DeclContext *DC = Template->getDeclContext();
CXXRecordDecl *Primary = Template->getTemplatedDecl();
DeclarationName DeductionGuideName =
SemaRef.Context.DeclarationNames.getCXXDeductionGuideName(Template);
QualType DeducedType = SemaRef.Context.getTypeDeclType(Primary);
// Index adjustment to apply to convert depth-1 template parameters into
// depth-0 template parameters.
unsigned Depth1IndexAdjustment = Template->getTemplateParameters()->size();
/// Transform a constructor declaration into a deduction guide.
NamedDecl *transformConstructor(FunctionTemplateDecl *FTD,
CXXConstructorDecl *CD) {
SmallVector<TemplateArgument, 16> SubstArgs;
LocalInstantiationScope Scope(SemaRef);
// C++ [over.match.class.deduct]p1:
// -- For each constructor of the class template designated by the
// template-name, a function template with the following properties:
// -- The template parameters are the template parameters of the class
// template followed by the template parameters (including default
// template arguments) of the constructor, if any.
TemplateParameterList *TemplateParams = Template->getTemplateParameters();
if (FTD) {
TemplateParameterList *InnerParams = FTD->getTemplateParameters();
SmallVector<NamedDecl *, 16> AllParams;
AllParams.reserve(TemplateParams->size() + InnerParams->size());
AllParams.insert(AllParams.begin(),
TemplateParams->begin(), TemplateParams->end());
SubstArgs.reserve(InnerParams->size());
// Later template parameters could refer to earlier ones, so build up
// a list of substituted template arguments as we go.
for (NamedDecl *Param : *InnerParams) {
MultiLevelTemplateArgumentList Args;
Args.addOuterTemplateArguments(SubstArgs);
Args.addOuterRetainedLevel();
NamedDecl *NewParam = transformTemplateParameter(Param, Args);
if (!NewParam)
return nullptr;
AllParams.push_back(NewParam);
SubstArgs.push_back(SemaRef.Context.getCanonicalTemplateArgument(
SemaRef.Context.getInjectedTemplateArg(NewParam)));
}
TemplateParams = TemplateParameterList::Create(
SemaRef.Context, InnerParams->getTemplateLoc(),
InnerParams->getLAngleLoc(), AllParams, InnerParams->getRAngleLoc(),
/*FIXME: RequiresClause*/ nullptr);
}
// If we built a new template-parameter-list, track that we need to
// substitute references to the old parameters into references to the
// new ones.
MultiLevelTemplateArgumentList Args;
if (FTD) {
Args.addOuterTemplateArguments(SubstArgs);
Args.addOuterRetainedLevel();
}
FunctionProtoTypeLoc FPTL = CD->getTypeSourceInfo()->getTypeLoc()
.getAsAdjusted<FunctionProtoTypeLoc>();
assert(FPTL && "no prototype for constructor declaration");
// Transform the type of the function, adjusting the return type and
// replacing references to the old parameters with references to the
// new ones.
TypeLocBuilder TLB;
SmallVector<ParmVarDecl*, 8> Params;
QualType NewType = transformFunctionProtoType(TLB, FPTL, Params, Args);
if (NewType.isNull())
return nullptr;
TypeSourceInfo *NewTInfo = TLB.getTypeSourceInfo(SemaRef.Context, NewType);
return buildDeductionGuide(TemplateParams, CD->getExplicitSpecifier(),
NewTInfo, CD->getBeginLoc(), CD->getLocation(),
CD->getEndLoc());
}
/// Build a deduction guide with the specified parameter types.
NamedDecl *buildSimpleDeductionGuide(MutableArrayRef<QualType> ParamTypes) {
SourceLocation Loc = Template->getLocation();
// Build the requested type.
FunctionProtoType::ExtProtoInfo EPI;
EPI.HasTrailingReturn = true;
QualType Result = SemaRef.BuildFunctionType(DeducedType, ParamTypes, Loc,
DeductionGuideName, EPI);
TypeSourceInfo *TSI = SemaRef.Context.getTrivialTypeSourceInfo(Result, Loc);
FunctionProtoTypeLoc FPTL =
TSI->getTypeLoc().castAs<FunctionProtoTypeLoc>();
// Build the parameters, needed during deduction / substitution.
SmallVector<ParmVarDecl*, 4> Params;
for (auto T : ParamTypes) {
ParmVarDecl *NewParam = ParmVarDecl::Create(
SemaRef.Context, DC, Loc, Loc, nullptr, T,
SemaRef.Context.getTrivialTypeSourceInfo(T, Loc), SC_None, nullptr);
NewParam->setScopeInfo(0, Params.size());
FPTL.setParam(Params.size(), NewParam);
Params.push_back(NewParam);
}
return buildDeductionGuide(Template->getTemplateParameters(),
ExplicitSpecifier(), TSI, Loc, Loc, Loc);
}
private:
/// Transform a constructor template parameter into a deduction guide template
/// parameter, rebuilding any internal references to earlier parameters and
/// renumbering as we go.
NamedDecl *transformTemplateParameter(NamedDecl *TemplateParam,
MultiLevelTemplateArgumentList &Args) {
if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam)) {
// TemplateTypeParmDecl's index cannot be changed after creation, so
// substitute it directly.
auto *NewTTP = TemplateTypeParmDecl::Create(
SemaRef.Context, DC, TTP->getBeginLoc(), TTP->getLocation(),
/*Depth*/ 0, Depth1IndexAdjustment + TTP->getIndex(),
TTP->getIdentifier(), TTP->wasDeclaredWithTypename(),
TTP->isParameterPack());
if (TTP->hasDefaultArgument()) {
TypeSourceInfo *InstantiatedDefaultArg =
SemaRef.SubstType(TTP->getDefaultArgumentInfo(), Args,
TTP->getDefaultArgumentLoc(), TTP->getDeclName());
if (InstantiatedDefaultArg)
NewTTP->setDefaultArgument(InstantiatedDefaultArg);
}
SemaRef.CurrentInstantiationScope->InstantiatedLocal(TemplateParam,
NewTTP);
return NewTTP;
}
if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
return transformTemplateParameterImpl(TTP, Args);
return transformTemplateParameterImpl(
cast<NonTypeTemplateParmDecl>(TemplateParam), Args);
}
template<typename TemplateParmDecl>
TemplateParmDecl *
transformTemplateParameterImpl(TemplateParmDecl *OldParam,
MultiLevelTemplateArgumentList &Args) {
// Ask the template instantiator to do the heavy lifting for us, then adjust
// the index of the parameter once it's done.
auto *NewParam =
cast_or_null<TemplateParmDecl>(SemaRef.SubstDecl(OldParam, DC, Args));
assert(NewParam->getDepth() == 0 && "unexpected template param depth");
NewParam->setPosition(NewParam->getPosition() + Depth1IndexAdjustment);
return NewParam;
}
QualType transformFunctionProtoType(TypeLocBuilder &TLB,
FunctionProtoTypeLoc TL,
SmallVectorImpl<ParmVarDecl*> &Params,
MultiLevelTemplateArgumentList &Args) {
SmallVector<QualType, 4> ParamTypes;
const FunctionProtoType *T = TL.getTypePtr();
// -- The types of the function parameters are those of the constructor.
for (auto *OldParam : TL.getParams()) {
ParmVarDecl *NewParam = transformFunctionTypeParam(OldParam, Args);
if (!NewParam)
return QualType();
ParamTypes.push_back(NewParam->getType());
Params.push_back(NewParam);
}
// -- The return type is the class template specialization designated by
// the template-name and template arguments corresponding to the
// template parameters obtained from the class template.
//
// We use the injected-class-name type of the primary template instead.
// This has the convenient property that it is different from any type that
// the user can write in a deduction-guide (because they cannot enter the
// context of the template), so implicit deduction guides can never collide
// with explicit ones.
QualType ReturnType = DeducedType;
TLB.pushTypeSpec(ReturnType).setNameLoc(Primary->getLocation());
// Resolving a wording defect, we also inherit the variadicness of the
// constructor.
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = T->isVariadic();
EPI.HasTrailingReturn = true;
QualType Result = SemaRef.BuildFunctionType(
ReturnType, ParamTypes, TL.getBeginLoc(), DeductionGuideName, EPI);
if (Result.isNull())
return QualType();
FunctionProtoTypeLoc NewTL = TLB.push<FunctionProtoTypeLoc>(Result);
NewTL.setLocalRangeBegin(TL.getLocalRangeBegin());
NewTL.setLParenLoc(TL.getLParenLoc());
NewTL.setRParenLoc(TL.getRParenLoc());
NewTL.setExceptionSpecRange(SourceRange());
NewTL.setLocalRangeEnd(TL.getLocalRangeEnd());
for (unsigned I = 0, E = NewTL.getNumParams(); I != E; ++I)
NewTL.setParam(I, Params[I]);
return Result;
}
ParmVarDecl *
transformFunctionTypeParam(ParmVarDecl *OldParam,
MultiLevelTemplateArgumentList &Args) {
TypeSourceInfo *OldDI = OldParam->getTypeSourceInfo();
TypeSourceInfo *NewDI;
if (auto PackTL = OldDI->getTypeLoc().getAs<PackExpansionTypeLoc>()) {
// Expand out the one and only element in each inner pack.
Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, 0);
NewDI =
SemaRef.SubstType(PackTL.getPatternLoc(), Args,
OldParam->getLocation(), OldParam->getDeclName());
if (!NewDI) return nullptr;
NewDI =
SemaRef.CheckPackExpansion(NewDI, PackTL.getEllipsisLoc(),
PackTL.getTypePtr()->getNumExpansions());
} else
NewDI = SemaRef.SubstType(OldDI, Args, OldParam->getLocation(),
OldParam->getDeclName());
if (!NewDI)
return nullptr;
// Extract the type. This (for instance) replaces references to typedef
// members of the current instantiation with the definitions of those
// typedefs, which avoids triggering instantiation of the deduced type during
// deduction.
NewDI = ExtractTypeForDeductionGuide(SemaRef).transform(NewDI);
// Resolving a wording defect, we also inherit default arguments from the
// constructor.
ExprResult NewDefArg;
if (OldParam->hasDefaultArg()) {
NewDefArg = SemaRef.SubstExpr(OldParam->getDefaultArg(), Args);
if (NewDefArg.isInvalid())
return nullptr;
}
ParmVarDecl *NewParam = ParmVarDecl::Create(SemaRef.Context, DC,
OldParam->getInnerLocStart(),
OldParam->getLocation(),
OldParam->getIdentifier(),
NewDI->getType(),
NewDI,
OldParam->getStorageClass(),
NewDefArg.get());
NewParam->setScopeInfo(OldParam->getFunctionScopeDepth(),
OldParam->getFunctionScopeIndex());
SemaRef.CurrentInstantiationScope->InstantiatedLocal(OldParam, NewParam);
return NewParam;
}
NamedDecl *buildDeductionGuide(TemplateParameterList *TemplateParams,
ExplicitSpecifier ES, TypeSourceInfo *TInfo,
SourceLocation LocStart, SourceLocation Loc,
SourceLocation LocEnd) {
DeclarationNameInfo Name(DeductionGuideName, Loc);
ArrayRef<ParmVarDecl *> Params =
TInfo->getTypeLoc().castAs<FunctionProtoTypeLoc>().getParams();
// Build the implicit deduction guide template.
auto *Guide =
CXXDeductionGuideDecl::Create(SemaRef.Context, DC, LocStart, ES, Name,
TInfo->getType(), TInfo, LocEnd);
Guide->setImplicit();
Guide->setParams(Params);
for (auto *Param : Params)
Param->setDeclContext(Guide);
auto *GuideTemplate = FunctionTemplateDecl::Create(
SemaRef.Context, DC, Loc, DeductionGuideName, TemplateParams, Guide);
GuideTemplate->setImplicit();
Guide->setDescribedFunctionTemplate(GuideTemplate);
if (isa<CXXRecordDecl>(DC)) {
Guide->setAccess(AS_public);
GuideTemplate->setAccess(AS_public);
}
DC->addDecl(GuideTemplate);
return GuideTemplate;
}
};
}
void Sema::DeclareImplicitDeductionGuides(TemplateDecl *Template,
SourceLocation Loc) {
if (CXXRecordDecl *DefRecord =
cast<CXXRecordDecl>(Template->getTemplatedDecl())->getDefinition()) {
TemplateDecl *DescribedTemplate = DefRecord->getDescribedClassTemplate();
Template = DescribedTemplate ? DescribedTemplate : Template;
}
DeclContext *DC = Template->getDeclContext();
if (DC->isDependentContext())
return;
ConvertConstructorToDeductionGuideTransform Transform(
*this, cast<ClassTemplateDecl>(Template));
if (!isCompleteType(Loc, Transform.DeducedType))
return;
// Check whether we've already declared deduction guides for this template.
// FIXME: Consider storing a flag on the template to indicate this.
auto Existing = DC->lookup(Transform.DeductionGuideName);
for (auto *D : Existing)
if (D->isImplicit())
return;
// In case we were expanding a pack when we attempted to declare deduction
// guides, turn off pack expansion for everything we're about to do.
ArgumentPackSubstitutionIndexRAII SubstIndex(*this, -1);
// Create a template instantiation record to track the "instantiation" of
// constructors into deduction guides.
// FIXME: Add a kind for this to give more meaningful diagnostics. But can
// this substitution process actually fail?
InstantiatingTemplate BuildingDeductionGuides(*this, Loc, Template);
if (BuildingDeductionGuides.isInvalid())
return;
// Convert declared constructors into deduction guide templates.
// FIXME: Skip constructors for which deduction must necessarily fail (those
// for which some class template parameter without a default argument never
// appears in a deduced context).
bool AddedAny = false;
for (NamedDecl *D : LookupConstructors(Transform.Primary)) {
D = D->getUnderlyingDecl();
if (D->isInvalidDecl() || D->isImplicit())
continue;
D = cast<NamedDecl>(D->getCanonicalDecl());
auto *FTD = dyn_cast<FunctionTemplateDecl>(D);
auto *CD =
dyn_cast_or_null<CXXConstructorDecl>(FTD ? FTD->getTemplatedDecl() : D);
// Class-scope explicit specializations (MS extension) do not result in
// deduction guides.
if (!CD || (!FTD && CD->isFunctionTemplateSpecialization()))
continue;
Transform.transformConstructor(FTD, CD);
AddedAny = true;
}
// C++17 [over.match.class.deduct]
// -- If C is not defined or does not declare any constructors, an
// additional function template derived as above from a hypothetical
// constructor C().
if (!AddedAny)
Transform.buildSimpleDeductionGuide(None);
// -- An additional function template derived as above from a hypothetical
// constructor C(C), called the copy deduction candidate.
cast<CXXDeductionGuideDecl>(
cast<FunctionTemplateDecl>(
Transform.buildSimpleDeductionGuide(Transform.DeducedType))
->getTemplatedDecl())
->setIsCopyDeductionCandidate();
}
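// A minimal sketch of the guides produced above (hypothetical names):
//
//   template <typename T> struct Pair { Pair(T, T); };
//   Pair P(1, 2); // uses the guide formed from Pair(T, T): deduces Pair<int>
//   Pair Q(P);    // uses the copy deduction candidate: deduces Pair<int>,
//                 // not Pair<Pair<int>>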
/// Diagnose the presence of a default template argument on a
/// template parameter, which is ill-formed in certain contexts.
///
/// \returns true if the default template argument should be dropped.
static bool DiagnoseDefaultTemplateArgument(Sema &S,
Sema::TemplateParamListContext TPC,
SourceLocation ParamLoc,
SourceRange DefArgRange) {
switch (TPC) {
case Sema::TPC_ClassTemplate:
case Sema::TPC_VarTemplate:
case Sema::TPC_TypeAliasTemplate:
return false;
case Sema::TPC_FunctionTemplate:
case Sema::TPC_FriendFunctionTemplateDefinition:
// C++ [temp.param]p9:
// A default template-argument shall not be specified in a
// function template declaration or a function template
// definition [...]
// If a friend function template declaration specifies a default
// template-argument, that declaration shall be a definition and shall be
// the only declaration of the function template in the translation unit.
// (C++98/03 doesn't have this wording; see DR226).
S.Diag(ParamLoc, S.getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_template_parameter_default_in_function_template
: diag::ext_template_parameter_default_in_function_template)
<< DefArgRange;
return false;
case Sema::TPC_ClassTemplateMember:
// C++0x [temp.param]p9:
// A default template-argument shall not be specified in the
// template-parameter-lists of the definition of a member of a
// class template that appears outside of the member's class.
S.Diag(ParamLoc, diag::err_template_parameter_default_template_member)
<< DefArgRange;
return true;
case Sema::TPC_FriendClassTemplate:
case Sema::TPC_FriendFunctionTemplate:
// C++ [temp.param]p9:
// A default template-argument shall not be specified in a
// friend template declaration.
S.Diag(ParamLoc, diag::err_template_parameter_default_friend_template)
<< DefArgRange;
return true;
// FIXME: C++0x [temp.param]p9 allows default template-arguments
// for friend function templates if there is only a single
// declaration (and it is a definition). Strange!
}
llvm_unreachable("Invalid TemplateParamListContext!");
}
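// Hypothetical examples for the contexts distinguished above:
//
//   template <typename T = int> void f(T);        // function template: an extension in
//                                                 // C++98, a compatibility warning later
//   struct S {
//     template <typename T = int> friend class F; // diagnosed: no default arguments in a
//   };                                            // friend template declaration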
/// Check for unexpanded parameter packs within the template parameters
/// of a template template parameter, recursively.
static bool DiagnoseUnexpandedParameterPacks(Sema &S,
TemplateTemplateParmDecl *TTP) {
// A template template parameter which is a parameter pack is also a pack
// expansion.
if (TTP->isParameterPack())
return false;
TemplateParameterList *Params = TTP->getTemplateParameters();
for (unsigned I = 0, N = Params->size(); I != N; ++I) {
NamedDecl *P = Params->getParam(I);
if (NonTypeTemplateParmDecl *NTTP = dyn_cast<NonTypeTemplateParmDecl>(P)) {
if (!NTTP->isParameterPack() &&
S.DiagnoseUnexpandedParameterPack(NTTP->getLocation(),
NTTP->getTypeSourceInfo(),
Sema::UPPC_NonTypeTemplateParameterType))
return true;
continue;
}
if (TemplateTemplateParmDecl *InnerTTP
= dyn_cast<TemplateTemplateParmDecl>(P))
if (DiagnoseUnexpandedParameterPacks(S, InnerTTP))
return true;
}
return false;
}
/// Checks the validity of a template parameter list, possibly
/// considering the template parameter list from a previous
/// declaration.
///
/// If an "old" template parameter list is provided, it must be
/// equivalent (per TemplateParameterListsAreEqual) to the "new"
/// template parameter list.
///
/// \param NewParams Template parameter list for a new template
/// declaration. This template parameter list will be updated with any
/// default arguments that are carried through from the previous
/// template parameter list.
///
/// \param OldParams If provided, template parameter list from a
/// previous declaration of the same template. Default template
/// arguments will be merged from the old template parameter list to
/// the new template parameter list.
///
/// \param TPC Describes the context in which we are checking the given
/// template parameter list.
///
/// \param SkipBody If we might have already made a prior merged definition
/// of this template visible, the corresponding body-skipping information.
/// Default argument redefinition is not an error when skipping such a body,
/// because (under the ODR) we can assume the default arguments are the same
/// as the prior merged definition.
///
/// \returns true if an error occurred, false otherwise.
bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams,
TemplateParameterList *OldParams,
TemplateParamListContext TPC,
SkipBodyInfo *SkipBody) {
bool Invalid = false;
// C++ [temp.param]p10:
// The set of default template-arguments available for use with a
// template declaration or definition is obtained by merging the
// default arguments from the definition (if in scope) and all
// declarations in scope in the same way default function
// arguments are (8.3.6).
bool SawDefaultArgument = false;
SourceLocation PreviousDefaultArgLoc;
// Dummy initialization to avoid warnings.
TemplateParameterList::iterator OldParam = NewParams->end();
if (OldParams)
OldParam = OldParams->begin();
bool RemoveDefaultArguments = false;
for (TemplateParameterList::iterator NewParam = NewParams->begin(),
NewParamEnd = NewParams->end();
NewParam != NewParamEnd; ++NewParam) {
// Variables used to diagnose redundant default arguments
bool RedundantDefaultArg = false;
SourceLocation OldDefaultLoc;
SourceLocation NewDefaultLoc;
// Variable used to diagnose missing default arguments
bool MissingDefaultArg = false;
// Variable used to diagnose non-final parameter packs
bool SawParameterPack = false;
if (TemplateTypeParmDecl *NewTypeParm
= dyn_cast<TemplateTypeParmDecl>(*NewParam)) {
// Check the presence of a default argument here.
if (NewTypeParm->hasDefaultArgument() &&
DiagnoseDefaultTemplateArgument(*this, TPC,
NewTypeParm->getLocation(),
NewTypeParm->getDefaultArgumentInfo()->getTypeLoc()
.getSourceRange()))
NewTypeParm->removeDefaultArgument();
// Merge default arguments for template type parameters.
TemplateTypeParmDecl *OldTypeParm
= OldParams? cast<TemplateTypeParmDecl>(*OldParam) : nullptr;
if (NewTypeParm->isParameterPack()) {
assert(!NewTypeParm->hasDefaultArgument() &&
"Parameter packs can't have a default argument!");
SawParameterPack = true;
} else if (OldTypeParm && hasVisibleDefaultArgument(OldTypeParm) &&
NewTypeParm->hasDefaultArgument() &&
(!SkipBody || !SkipBody->ShouldSkip)) {
OldDefaultLoc = OldTypeParm->getDefaultArgumentLoc();
NewDefaultLoc = NewTypeParm->getDefaultArgumentLoc();
SawDefaultArgument = true;
RedundantDefaultArg = true;
PreviousDefaultArgLoc = NewDefaultLoc;
} else if (OldTypeParm && OldTypeParm->hasDefaultArgument()) {
// Merge the default argument from the old declaration to the
// new declaration.
NewTypeParm->setInheritedDefaultArgument(Context, OldTypeParm);
PreviousDefaultArgLoc = OldTypeParm->getDefaultArgumentLoc();
} else if (NewTypeParm->hasDefaultArgument()) {
SawDefaultArgument = true;
PreviousDefaultArgLoc = NewTypeParm->getDefaultArgumentLoc();
} else if (SawDefaultArgument)
MissingDefaultArg = true;
} else if (NonTypeTemplateParmDecl *NewNonTypeParm
= dyn_cast<NonTypeTemplateParmDecl>(*NewParam)) {
// Check for unexpanded parameter packs.
if (!NewNonTypeParm->isParameterPack() &&
DiagnoseUnexpandedParameterPack(NewNonTypeParm->getLocation(),
NewNonTypeParm->getTypeSourceInfo(),
UPPC_NonTypeTemplateParameterType)) {
Invalid = true;
continue;
}
// Check the presence of a default argument here.
if (NewNonTypeParm->hasDefaultArgument() &&
DiagnoseDefaultTemplateArgument(*this, TPC,
NewNonTypeParm->getLocation(),
NewNonTypeParm->getDefaultArgument()->getSourceRange())) {
NewNonTypeParm->removeDefaultArgument();
}
// Merge default arguments for non-type template parameters
NonTypeTemplateParmDecl *OldNonTypeParm
= OldParams? cast<NonTypeTemplateParmDecl>(*OldParam) : nullptr;
if (NewNonTypeParm->isParameterPack()) {
assert(!NewNonTypeParm->hasDefaultArgument() &&
"Parameter packs can't have a default argument!");
if (!NewNonTypeParm->isPackExpansion())
SawParameterPack = true;
} else if (OldNonTypeParm && hasVisibleDefaultArgument(OldNonTypeParm) &&
NewNonTypeParm->hasDefaultArgument() &&
(!SkipBody || !SkipBody->ShouldSkip)) {
OldDefaultLoc = OldNonTypeParm->getDefaultArgumentLoc();
NewDefaultLoc = NewNonTypeParm->getDefaultArgumentLoc();
SawDefaultArgument = true;
RedundantDefaultArg = true;
PreviousDefaultArgLoc = NewDefaultLoc;
} else if (OldNonTypeParm && OldNonTypeParm->hasDefaultArgument()) {
// Merge the default argument from the old declaration to the
// new declaration.
NewNonTypeParm->setInheritedDefaultArgument(Context, OldNonTypeParm);
PreviousDefaultArgLoc = OldNonTypeParm->getDefaultArgumentLoc();
} else if (NewNonTypeParm->hasDefaultArgument()) {
SawDefaultArgument = true;
PreviousDefaultArgLoc = NewNonTypeParm->getDefaultArgumentLoc();
} else if (SawDefaultArgument)
MissingDefaultArg = true;
} else {
TemplateTemplateParmDecl *NewTemplateParm
= cast<TemplateTemplateParmDecl>(*NewParam);
// Check for unexpanded parameter packs, recursively.
if (::DiagnoseUnexpandedParameterPacks(*this, NewTemplateParm)) {
Invalid = true;
continue;
}
// Check the presence of a default argument here.
if (NewTemplateParm->hasDefaultArgument() &&
DiagnoseDefaultTemplateArgument(*this, TPC,
NewTemplateParm->getLocation(),
NewTemplateParm->getDefaultArgument().getSourceRange()))
NewTemplateParm->removeDefaultArgument();
// Merge default arguments for template template parameters
TemplateTemplateParmDecl *OldTemplateParm
= OldParams? cast<TemplateTemplateParmDecl>(*OldParam) : nullptr;
if (NewTemplateParm->isParameterPack()) {
assert(!NewTemplateParm->hasDefaultArgument() &&
"Parameter packs can't have a default argument!");
if (!NewTemplateParm->isPackExpansion())
SawParameterPack = true;
} else if (OldTemplateParm &&
hasVisibleDefaultArgument(OldTemplateParm) &&
NewTemplateParm->hasDefaultArgument() &&
(!SkipBody || !SkipBody->ShouldSkip)) {
OldDefaultLoc = OldTemplateParm->getDefaultArgument().getLocation();
NewDefaultLoc = NewTemplateParm->getDefaultArgument().getLocation();
SawDefaultArgument = true;
RedundantDefaultArg = true;
PreviousDefaultArgLoc = NewDefaultLoc;
} else if (OldTemplateParm && OldTemplateParm->hasDefaultArgument()) {
// Merge the default argument from the old declaration to the
// new declaration.
NewTemplateParm->setInheritedDefaultArgument(Context, OldTemplateParm);
PreviousDefaultArgLoc
= OldTemplateParm->getDefaultArgument().getLocation();
} else if (NewTemplateParm->hasDefaultArgument()) {
SawDefaultArgument = true;
PreviousDefaultArgLoc
= NewTemplateParm->getDefaultArgument().getLocation();
} else if (SawDefaultArgument)
MissingDefaultArg = true;
}
// C++11 [temp.param]p11:
// If a template parameter of a primary class template or alias template
// is a template parameter pack, it shall be the last template parameter.
if (SawParameterPack && (NewParam + 1) != NewParamEnd &&
(TPC == TPC_ClassTemplate || TPC == TPC_VarTemplate ||
TPC == TPC_TypeAliasTemplate)) {
Diag((*NewParam)->getLocation(),
diag::err_template_param_pack_must_be_last_template_parameter);
Invalid = true;
}
if (RedundantDefaultArg) {
// C++ [temp.param]p12:
// A template-parameter shall not be given default arguments
// by two different declarations in the same scope.
Diag(NewDefaultLoc, diag::err_template_param_default_arg_redefinition);
Diag(OldDefaultLoc, diag::note_template_param_prev_default_arg);
Invalid = true;
} else if (MissingDefaultArg && TPC != TPC_FunctionTemplate) {
// C++ [temp.param]p11:
// If a template-parameter of a class template has a default
// template-argument, each subsequent template-parameter shall either
// have a default template-argument supplied or be a template parameter
// pack.
Diag((*NewParam)->getLocation(),
diag::err_template_param_default_arg_missing);
Diag(PreviousDefaultArgLoc, diag::note_template_param_prev_default_arg);
Invalid = true;
RemoveDefaultArguments = true;
}
// If we have an old template parameter list that we're merging
// in, move on to the next parameter.
if (OldParams)
++OldParam;
}
// We were missing some default arguments at the end of the list, so remove
// all of the default arguments.
if (RemoveDefaultArguments) {
for (TemplateParameterList::iterator NewParam = NewParams->begin(),
NewParamEnd = NewParams->end();
NewParam != NewParamEnd; ++NewParam) {
if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(*NewParam))
TTP->removeDefaultArgument();
else if (NonTypeTemplateParmDecl *NTTP
= dyn_cast<NonTypeTemplateParmDecl>(*NewParam))
NTTP->removeDefaultArgument();
else
cast<TemplateTemplateParmDecl>(*NewParam)->removeDefaultArgument();
}
}
return Invalid;
}
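// Hypothetical redeclarations exercising the merging and diagnostics above:
//
//   template <typename T = int> class A;
//   template <typename T = int> class A {};          // diagnosed: default argument given
//                                                    // by two declarations
//   template <typename T = int, typename U> class B; // diagnosed: 'U' needs a default
//                                                    // argument or must be a pack
//   template <typename T> class C;
//   template <typename T = int> class C;
//   C<> *P;                                          // OK: the default merges across
//                                                    // the two declarations of 'C'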
namespace {
/// A class which looks for a use of a certain level of template
/// parameter.
struct DependencyChecker : RecursiveASTVisitor<DependencyChecker> {
typedef RecursiveASTVisitor<DependencyChecker> super;
unsigned Depth;
// Whether we're looking for a use of a template parameter that makes the
// overall construct type-dependent / a dependent type. This is strictly
// best-effort for now; we may fail to match at all for a dependent type
// in some cases if this is set.
bool IgnoreNonTypeDependent;
bool Match;
SourceLocation MatchLoc;
DependencyChecker(unsigned Depth, bool IgnoreNonTypeDependent)
: Depth(Depth), IgnoreNonTypeDependent(IgnoreNonTypeDependent),
Match(false) {}
DependencyChecker(TemplateParameterList *Params, bool IgnoreNonTypeDependent)
: IgnoreNonTypeDependent(IgnoreNonTypeDependent), Match(false) {
NamedDecl *ND = Params->getParam(0);
if (TemplateTypeParmDecl *PD = dyn_cast<TemplateTypeParmDecl>(ND)) {
Depth = PD->getDepth();
} else if (NonTypeTemplateParmDecl *PD =
dyn_cast<NonTypeTemplateParmDecl>(ND)) {
Depth = PD->getDepth();
} else {
Depth = cast<TemplateTemplateParmDecl>(ND)->getDepth();
}
}
bool Matches(unsigned ParmDepth, SourceLocation Loc = SourceLocation()) {
if (ParmDepth >= Depth) {
Match = true;
MatchLoc = Loc;
return true;
}
return false;
}
bool TraverseStmt(Stmt *S, DataRecursionQueue *Q = nullptr) {
// Prune out non-type-dependent expressions if requested. This can
// sometimes result in us failing to find a template parameter reference
// (if a value-dependent expression creates a dependent type), but this
// mode is best-effort only.
if (auto *E = dyn_cast_or_null<Expr>(S))
if (IgnoreNonTypeDependent && !E->isTypeDependent())
return true;
return super::TraverseStmt(S, Q);
}
bool TraverseTypeLoc(TypeLoc TL) {
if (IgnoreNonTypeDependent && !TL.isNull() &&
!TL.getType()->isDependentType())
return true;
return super::TraverseTypeLoc(TL);
}
bool VisitTemplateTypeParmTypeLoc(TemplateTypeParmTypeLoc TL) {
return !Matches(TL.getTypePtr()->getDepth(), TL.getNameLoc());
}
bool VisitTemplateTypeParmType(const TemplateTypeParmType *T) {
// For a best-effort search, keep looking until we find a location.
return IgnoreNonTypeDependent || !Matches(T->getDepth());
}
bool TraverseTemplateName(TemplateName N) {
if (TemplateTemplateParmDecl *PD =
dyn_cast_or_null<TemplateTemplateParmDecl>(N.getAsTemplateDecl()))
if (Matches(PD->getDepth()))
return false;
return super::TraverseTemplateName(N);
}
bool VisitDeclRefExpr(DeclRefExpr *E) {
if (NonTypeTemplateParmDecl *PD =
dyn_cast<NonTypeTemplateParmDecl>(E->getDecl()))
if (Matches(PD->getDepth(), E->getExprLoc()))
return false;
return super::VisitDeclRefExpr(E);
}
bool VisitSubstTemplateTypeParmType(const SubstTemplateTypeParmType *T) {
return TraverseType(T->getReplacementType());
}
bool
VisitSubstTemplateTypeParmPackType(const SubstTemplateTypeParmPackType *T) {
return TraverseTemplateArgument(T->getArgumentPack());
}
bool TraverseInjectedClassNameType(const InjectedClassNameType *T) {
return TraverseType(T->getInjectedSpecializationType());
}
};
} // end anonymous namespace
/// Determines whether a given type depends on the given parameter
/// list.
static bool
DependsOnTemplateParameters(QualType T, TemplateParameterList *Params) {
DependencyChecker Checker(Params, /*IgnoreNonTypeDependent*/false);
Checker.TraverseType(T);
return Checker.Match;
}
// Find the source range corresponding to the named type in the given
// nested-name-specifier, if any.
static SourceRange getRangeOfTypeInNestedNameSpecifier(ASTContext &Context,
QualType T,
const CXXScopeSpec &SS) {
NestedNameSpecifierLoc NNSLoc(SS.getScopeRep(), SS.location_data());
while (NestedNameSpecifier *NNS = NNSLoc.getNestedNameSpecifier()) {
if (const Type *CurType = NNS->getAsType()) {
if (Context.hasSameUnqualifiedType(T, QualType(CurType, 0)))
return NNSLoc.getTypeLoc().getSourceRange();
} else
break;
NNSLoc = NNSLoc.getPrefix();
}
return SourceRange();
}
/// Match the given template parameter lists to the given scope
/// specifier, returning the template parameter list that applies to the
/// name.
///
/// \param DeclStartLoc the start of the declaration that has a scope
/// specifier or a template parameter list.
///
/// \param DeclLoc The location of the declaration itself.
///
/// \param SS the scope specifier that will be matched to the given template
/// parameter lists. This scope specifier precedes a qualified name that is
/// being declared.
///
/// \param TemplateId The template-id following the scope specifier, if there
/// is one. Used to check for a missing 'template<>'.
///
/// \param ParamLists the template parameter lists, from the outermost to the
/// innermost template parameter lists.
///
/// \param IsFriend Whether to apply the slightly different rules for
/// matching template parameters to scope specifiers in friend
/// declarations.
///
/// \param IsMemberSpecialization will be set true if the scope specifier
/// denotes a fully-specialized type, and therefore this is a declaration of
/// a member specialization.
///
/// \returns the template parameter list, if any, that corresponds to the
/// name that is preceded by the scope specifier @p SS. This template
/// parameter list may have template parameters (if we're declaring a
/// template) or may have no template parameters (if we're declaring a
/// template specialization), or may be NULL (if what we're declaring isn't
/// itself a template).
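///
/// Illustrative example (assumed names, sketch only):
/// \code
///   template<typename T> struct X { struct Y; void f(); };
///   template<typename T> struct X<T>::Y {};  // ParamLists[0] corresponds to X's parameters
///   template<> void X<int>::f() {}           // member specialization; empty 'template<>'
/// \endcode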
TemplateParameterList *Sema::MatchTemplateParametersToScopeSpecifier(
SourceLocation DeclStartLoc, SourceLocation DeclLoc, const CXXScopeSpec &SS,
TemplateIdAnnotation *TemplateId,
ArrayRef<TemplateParameterList *> ParamLists, bool IsFriend,
bool &IsMemberSpecialization, bool &Invalid) {
IsMemberSpecialization = false;
Invalid = false;
// The sequence of nested types to which we will match up the template
// parameter lists. We first build this list by starting with the type named
// by the nested-name-specifier and walking out until we run out of types.
SmallVector<QualType, 4> NestedTypes;
QualType T;
if (SS.getScopeRep()) {
if (CXXRecordDecl *Record
= dyn_cast_or_null<CXXRecordDecl>(computeDeclContext(SS, true)))
T = Context.getTypeDeclType(Record);
else
T = QualType(SS.getScopeRep()->getAsType(), 0);
}
// If we found an explicit specialization that prevents us from needing
// 'template<>' headers, this will be set to the location of that
// explicit specialization.
SourceLocation ExplicitSpecLoc;
while (!T.isNull()) {
NestedTypes.push_back(T);
// Retrieve the parent of a record type.
if (CXXRecordDecl *Record = T->getAsCXXRecordDecl()) {
// If this type is an explicit specialization, we're done.
if (ClassTemplateSpecializationDecl *Spec
= dyn_cast<ClassTemplateSpecializationDecl>(Record)) {
if (!isa<ClassTemplatePartialSpecializationDecl>(Spec) &&
Spec->getSpecializationKind() == TSK_ExplicitSpecialization) {
ExplicitSpecLoc = Spec->getLocation();
break;
}
} else if (Record->getTemplateSpecializationKind()
== TSK_ExplicitSpecialization) {
ExplicitSpecLoc = Record->getLocation();
break;
}
if (TypeDecl *Parent = dyn_cast<TypeDecl>(Record->getParent()))
T = Context.getTypeDeclType(Parent);
else
T = QualType();
continue;
}
if (const TemplateSpecializationType *TST
= T->getAs<TemplateSpecializationType>()) {
if (TemplateDecl *Template = TST->getTemplateName().getAsTemplateDecl()) {
if (TypeDecl *Parent = dyn_cast<TypeDecl>(Template->getDeclContext()))
T = Context.getTypeDeclType(Parent);
else
T = QualType();
continue;
}
}
// Look one step prior in a dependent template specialization type.
if (const DependentTemplateSpecializationType *DependentTST
= T->getAs<DependentTemplateSpecializationType>()) {
if (NestedNameSpecifier *NNS = DependentTST->getQualifier())
T = QualType(NNS->getAsType(), 0);
else
T = QualType();
continue;
}
// Look one step prior in a dependent name type.
if (const DependentNameType *DependentName = T->getAs<DependentNameType>()){
if (NestedNameSpecifier *NNS = DependentName->getQualifier())
T = QualType(NNS->getAsType(), 0);
else
T = QualType();
continue;
}
// Retrieve the parent of an enumeration type.
if (const EnumType *EnumT = T->getAs<EnumType>()) {
// FIXME: Forward-declared enums require a TSK_ExplicitSpecialization
// check here.
EnumDecl *Enum = EnumT->getDecl();
// Get to the parent type.
if (TypeDecl *Parent = dyn_cast<TypeDecl>(Enum->getParent()))
T = Context.getTypeDeclType(Parent);
else
T = QualType();
continue;
}
T = QualType();
}
// Reverse the nested types list, since we want to traverse from the outermost
// to the innermost while checking template-parameter-lists.
std::reverse(NestedTypes.begin(), NestedTypes.end());
// C++0x [temp.expl.spec]p17:
// A member or a member template may be nested within many
// enclosing class templates. In an explicit specialization for
// such a member, the member declaration shall be preceded by a
// template<> for each enclosing class template that is
// explicitly specialized.
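//
// Illustrative example (mirrors the standard's own example; sketch only):
//   template<class T1> class A {
//     template<class T2> class B { void mf(); };
//   };
//   template<> template<> class A<int>::B<double>;      // one 'template<>' per enclosing level
//   template<> template<> void A<char>::B<char>::mf();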
bool SawNonEmptyTemplateParameterList = false;
auto CheckExplicitSpecialization = [&](SourceRange Range, bool Recovery) {
if (SawNonEmptyTemplateParameterList) {
Diag(DeclLoc, diag::err_specialize_member_of_template)
<< !Recovery << Range;
Invalid = true;
IsMemberSpecialization = false;
return true;
}
return false;
};
auto DiagnoseMissingExplicitSpecialization = [&] (SourceRange Range) {
// Check that we can have an explicit specialization here.
if (CheckExplicitSpecialization(Range, true))
return true;
// We don't have a template header, but we should.
SourceLocation ExpectedTemplateLoc;
if (!ParamLists.empty())
ExpectedTemplateLoc = ParamLists[0]->getTemplateLoc();
else
ExpectedTemplateLoc = DeclStartLoc;
Diag(DeclLoc, diag::err_template_spec_needs_header)
<< Range
<< FixItHint::CreateInsertion(ExpectedTemplateLoc, "template<> ");
return false;
};
unsigned ParamIdx = 0;
for (unsigned TypeIdx = 0, NumTypes = NestedTypes.size(); TypeIdx != NumTypes;
++TypeIdx) {
T = NestedTypes[TypeIdx];
// Whether we expect a 'template<>' header.
bool NeedEmptyTemplateHeader = false;
// Whether we expect a template header with parameters.
bool NeedNonemptyTemplateHeader = false;
// For a dependent type, the set of template parameters that we
// expect to see.
TemplateParameterList *ExpectedTemplateParams = nullptr;
// C++0x [temp.expl.spec]p15:
// A member or a member template may be nested within many enclosing
// class templates. In an explicit specialization for such a member, the
// member declaration shall be preceded by a template<> for each
// enclosing class template that is explicitly specialized.
if (CXXRecordDecl *Record = T->getAsCXXRecordDecl()) {
if (ClassTemplatePartialSpecializationDecl *Partial
= dyn_cast<ClassTemplatePartialSpecializationDecl>(Record)) {
ExpectedTemplateParams = Partial->getTemplateParameters();
NeedNonemptyTemplateHeader = true;
} else if (Record->isDependentType()) {
if (Record->getDescribedClassTemplate()) {
ExpectedTemplateParams = Record->getDescribedClassTemplate()
->getTemplateParameters();
NeedNonemptyTemplateHeader = true;
}
} else if (ClassTemplateSpecializationDecl *Spec
= dyn_cast<ClassTemplateSpecializationDecl>(Record)) {
// C++0x [temp.expl.spec]p4:
// Members of an explicitly specialized class template are defined
// in the same manner as members of normal classes, and not using
// the template<> syntax.
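//
// For example (illustrative sketch only):
//   template<class T> struct A { void f(); };
//   template<> struct A<int> { void f(); };
//   void A<int>::f() {}   // defined like an ordinary class member, no 'template<>'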
if (Spec->getSpecializationKind() != TSK_ExplicitSpecialization)
NeedEmptyTemplateHeader = true;
else
continue;
} else if (Record->getTemplateSpecializationKind()) {
if (Record->getTemplateSpecializationKind()
!= TSK_ExplicitSpecialization &&
TypeIdx == NumTypes - 1)
IsMemberSpecialization = true;
continue;
}
} else if (const TemplateSpecializationType *TST
= T->getAs<TemplateSpecializationType>()) {
if (TemplateDecl *Template = TST->getTemplateName().getAsTemplateDecl()) {
ExpectedTemplateParams = Template->getTemplateParameters();
NeedNonemptyTemplateHeader = true;
}
} else if (T->getAs<DependentTemplateSpecializationType>()) {
// FIXME: We actually could/should check the template arguments here
// against the corresponding template parameter list.
NeedNonemptyTemplateHeader = false;
}
// C++ [temp.expl.spec]p16:
// In an explicit specialization declaration for a member of a class
// template or a member template that appears in namespace scope, the
// member template and some of its enclosing class templates may remain
// unspecialized, except that the declaration shall not explicitly
// specialize a class member template if its enclosing class templates
// are not explicitly specialized as well.
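//
// For example (illustrative sketch only):
//   template<class T> struct A { template<class U> struct B {}; };
//   template<> template<class U> struct A<int>::B {};     // OK: A<int> is explicitly specialized
//   template<class T> template<> struct A<T>::B<int> {};  // ill-formed: A<T> is not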
if (ParamIdx < ParamLists.size()) {
if (ParamLists[ParamIdx]->size() == 0) {
if (CheckExplicitSpecialization(ParamLists[ParamIdx]->getSourceRange(),
false))
return nullptr;
} else
SawNonEmptyTemplateParameterList = true;
}
if (NeedEmptyTemplateHeader) {
// If we're on the last of the types, and we need a 'template<>' header
// here, then it's a member specialization.
if (TypeIdx == NumTypes - 1)
IsMemberSpecialization = true;
if (ParamIdx < ParamLists.size()) {
if (ParamLists[ParamIdx]->size() > 0) {
// The header has template parameters when it shouldn't. Complain.
Diag(ParamLists[ParamIdx]->getTemplateLoc(),
diag::err_template_param_list_matches_nontemplate)
<< T
<< SourceRange(ParamLists[ParamIdx]->getLAngleLoc(),
ParamLists[ParamIdx]->getRAngleLoc())
<< getRangeOfTypeInNestedNameSpecifier(Context, T, SS);
Invalid = true;
return nullptr;
}
// Consume this template header.
++ParamIdx;
continue;
}
if (!IsFriend)
if (DiagnoseMissingExplicitSpecialization(
getRangeOfTypeInNestedNameSpecifier(Context, T, SS)))
return nullptr;
continue;
}
if (NeedNonemptyTemplateHeader) {
// In friend declarations we can have template-ids which don't
// depend on the corresponding template parameter lists. But
// assume that empty parameter lists are supposed to match this
// template-id.
if (IsFriend && T->isDependentType()) {
if (ParamIdx < ParamLists.size() &&
DependsOnTemplateParameters(T, ParamLists[ParamIdx]))
ExpectedTemplateParams = nullptr;
else
continue;
}
if (ParamIdx < ParamLists.size()) {
// Check the template parameter list, if we can.
if (ExpectedTemplateParams &&
!TemplateParameterListsAreEqual(ParamLists[ParamIdx],
ExpectedTemplateParams,
true, TPL_TemplateMatch))
Invalid = true;
if (!Invalid &&
CheckTemplateParameterList(ParamLists[ParamIdx], nullptr,
TPC_ClassTemplateMember))
Invalid = true;
++ParamIdx;
continue;
}
Diag(DeclLoc, diag::err_template_spec_needs_template_parameters)
<< T
<< getRangeOfTypeInNestedNameSpecifier(Context, T, SS);
Invalid = true;
continue;
}
}
// If there were at least as many template-ids as there were template
// parameter lists, then there are no template parameter lists remaining for
// the declaration itself.
if (ParamIdx >= ParamLists.size()) {
if (TemplateId && !IsFriend) {
// We don't have a template header for the declaration itself, but we
// should.
DiagnoseMissingExplicitSpecialization(SourceRange(TemplateId->LAngleLoc,
TemplateId->RAngleLoc));
// Fabricate an empty template parameter list for the invented header.
return TemplateParameterList::Create(Context, SourceLocation(),
SourceLocation(), None,
SourceLocation(), nullptr);
}
return nullptr;
}
// If there were too many template parameter lists, complain about that now.
if (ParamIdx < ParamLists.size() - 1) {
bool HasAnyExplicitSpecHeader = false;
bool AllExplicitSpecHeaders = true;
for (unsigned I = ParamIdx, E = ParamLists.size() - 1; I != E; ++I) {
if (ParamLists[I]->size() == 0)
HasAnyExplicitSpecHeader = true;
else
AllExplicitSpecHeaders = false;
}
Diag(ParamLists[ParamIdx]->getTemplateLoc(),
AllExplicitSpecHeaders ? diag::warn_template_spec_extra_headers
: diag::err_template_spec_extra_headers)
<< SourceRange(ParamLists[ParamIdx]->getTemplateLoc(),
ParamLists[ParamLists.size() - 2]->getRAngleLoc());
// If there was a specialization somewhere, such that 'template<>' is
// not required, and there were any 'template<>' headers, note where the
// specialization occurred.
if (ExplicitSpecLoc.isValid() && HasAnyExplicitSpecHeader)
Diag(ExplicitSpecLoc,
diag::note_explicit_template_spec_does_not_need_header)
<< NestedTypes.back();
// We have a template parameter list with no corresponding scope, which
// means that the resulting template declaration can't be instantiated
// properly (we'll end up with dependent nodes when we shouldn't).
if (!AllExplicitSpecHeaders)
Invalid = true;
}
// C++ [temp.expl.spec]p16:
// In an explicit specialization declaration for a member of a class
// template or a member template that appears in namespace scope, the
// member template and some of its enclosing class templates may remain
// unspecialized, except that the declaration shall not explicitly
// specialize a class member template if its enclosing class templates
// are not explicitly specialized as well.
if (ParamLists.back()->size() == 0 &&
CheckExplicitSpecialization(ParamLists[ParamIdx]->getSourceRange(),
false))
return nullptr;
// Return the last template parameter list, which corresponds to the
// entity being declared.
return ParamLists.back();
}
void Sema::NoteAllFoundTemplates(TemplateName Name) {
if (TemplateDecl *Template = Name.getAsTemplateDecl()) {
Diag(Template->getLocation(), diag::note_template_declared_here)
<< (isa<FunctionTemplateDecl>(Template)
? 0
: isa<ClassTemplateDecl>(Template)
? 1
: isa<VarTemplateDecl>(Template)
? 2
: isa<TypeAliasTemplateDecl>(Template) ? 3 : 4)
<< Template->getDeclName();
return;
}
if (OverloadedTemplateStorage *OST = Name.getAsOverloadedTemplate()) {
for (OverloadedTemplateStorage::iterator I = OST->begin(),
IEnd = OST->end();
I != IEnd; ++I)
Diag((*I)->getLocation(), diag::note_template_declared_here)
<< 0 << (*I)->getDeclName();
return;
}
}
static QualType
checkBuiltinTemplateIdType(Sema &SemaRef, BuiltinTemplateDecl *BTD,
const SmallVectorImpl<TemplateArgument> &Converted,
SourceLocation TemplateLoc,
TemplateArgumentListInfo &TemplateArgs) {
ASTContext &Context = SemaRef.getASTContext();
switch (BTD->getBuiltinTemplateKind()) {
case BTK__make_integer_seq: {
// Specializations of __make_integer_seq<S, T, N> are treated like
// S<T, 0, ..., N-1>.
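// For example (illustrative): __make_integer_seq<integer_sequence, int, 3>
// behaves like integer_sequence<int, 0, 1, 2>.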
// C++14 [intseq.intseq]p1:
// T shall be an integer type.
if (!Converted[1].getAsType()->isIntegralType(Context)) {
SemaRef.Diag(TemplateArgs[1].getLocation(),
diag::err_integer_sequence_integral_element_type);
return QualType();
}
// C++14 [intseq.make]p1:
// If N is negative the program is ill-formed.
TemplateArgument NumArgsArg = Converted[2];
llvm::APSInt NumArgs = NumArgsArg.getAsIntegral();
if (NumArgs < 0) {
SemaRef.Diag(TemplateArgs[2].getLocation(),
diag::err_integer_sequence_negative_length);
return QualType();
}
QualType ArgTy = NumArgsArg.getIntegralType();
TemplateArgumentListInfo SyntheticTemplateArgs;
// The type argument gets reused as the first template argument in the
// synthetic template argument list.
SyntheticTemplateArgs.addArgument(TemplateArgs[1]);
// Expand N into 0 ... N-1.
for (llvm::APSInt I(NumArgs.getBitWidth(), NumArgs.isUnsigned());
I < NumArgs; ++I) {
TemplateArgument TA(Context, I, ArgTy);
SyntheticTemplateArgs.addArgument(SemaRef.getTrivialTemplateArgumentLoc(
TA, ArgTy, TemplateArgs[2].getLocation()));
}
// The first template argument will be reused as the template decl that
// our synthetic template arguments will be applied to.
return SemaRef.CheckTemplateIdType(Converted[0].getAsTemplate(),
TemplateLoc, SyntheticTemplateArgs);
}
case BTK__type_pack_element:
// Specializations of
// __type_pack_element<Index, T_1, ..., T_N>
// are treated like T_Index.
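// For example (illustrative): __type_pack_element<1, char, short, int> is 'short'.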
assert(Converted.size() == 2 &&
"__type_pack_element should be given an index and a parameter pack");
// If the Index is out of bounds, the program is ill-formed.
TemplateArgument IndexArg = Converted[0], Ts = Converted[1];
llvm::APSInt Index = IndexArg.getAsIntegral();
assert(Index >= 0 && "the index used with __type_pack_element should be of "
"type std::size_t, and hence be non-negative");
if (Index >= Ts.pack_size()) {
SemaRef.Diag(TemplateArgs[0].getLocation(),
diag::err_type_pack_element_out_of_bounds);
return QualType();
}
// We simply return the type at index `Index`.
auto Nth = std::next(Ts.pack_begin(), Index.getExtValue());
return Nth->getAsType();
}
llvm_unreachable("unexpected BuiltinTemplateDecl!");
}
/// Determine whether this alias template is "enable_if_t".
static bool isEnableIfAliasTemplate(TypeAliasTemplateDecl *AliasTemplate) {
return AliasTemplate->getName().equals("enable_if_t");
}
/// Collect all of the separable terms in the given condition, which
/// might be a conjunction.
///
/// FIXME: The right answer is to convert the logical expression into
/// disjunctive normal form, so we can find the first failed term
/// within each possible clause.
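///
/// For example (illustrative only): given the condition 'A && (B && C)',
/// Terms ends up containing {A, B, C}.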
static void collectConjunctionTerms(Expr *Clause,
SmallVectorImpl<Expr *> &Terms) {
if (auto BinOp = dyn_cast<BinaryOperator>(Clause->IgnoreParenImpCasts())) {
if (BinOp->getOpcode() == BO_LAnd) {
collectConjunctionTerms(BinOp->getLHS(), Terms);
collectConjunctionTerms(BinOp->getRHS(), Terms);
}
return;
}
Terms.push_back(Clause);
}
// The ranges-v3 library uses an odd pattern of a top-level "||" with
// a left-hand side that is value-dependent but never true. Identify
// the idiom and ignore that term.
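// Illustrative sketch of the idiom (the exact macro expansion is assumed and
// simplified here): CONCEPT_REQUIRES_(Cond) expands to something roughly like
//   (SomeDependentConstant == <integer-literal>) || (Cond)
// so the user-provided condition is the right-hand side of the '||'.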
static Expr *lookThroughRangesV3Condition(Preprocessor &PP, Expr *Cond) {
// Top-level '||'.
auto *BinOp = dyn_cast<BinaryOperator>(Cond->IgnoreParenImpCasts());
if (!BinOp) return Cond;
if (BinOp->getOpcode() != BO_LOr) return Cond;
// With an inner '==' that has a literal on the right-hand side.
Expr *LHS = BinOp->getLHS();
auto *InnerBinOp = dyn_cast<BinaryOperator>(LHS->IgnoreParenImpCasts());
if (!InnerBinOp) return Cond;
if (InnerBinOp->getOpcode() != BO_EQ ||
!isa<IntegerLiteral>(InnerBinOp->getRHS()))
return Cond;
// If the inner binary operation came from a macro expansion named
// CONCEPT_REQUIRES or CONCEPT_REQUIRES_, return the right-hand side
// of the '||', which is the real, user-provided condition.
SourceLocation Loc = InnerBinOp->getExprLoc();
if (!Loc.isMacroID()) return Cond;
StringRef MacroName = PP.getImmediateMacroName(Loc);
if (MacroName == "CONCEPT_REQUIRES" || MacroName == "CONCEPT_REQUIRES_")
return BinOp->getRHS();
return Cond;
}
namespace {
// A PrinterHelper that prints more helpful diagnostics for some sub-expressions
// within a failing boolean expression, such as substituting template parameters
// for actual types.
class FailedBooleanConditionPrinterHelper : public PrinterHelper {
public:
explicit FailedBooleanConditionPrinterHelper(const PrintingPolicy &P)
: Policy(P) {}
bool handledStmt(Stmt *E, raw_ostream &OS) override {
const auto *DR = dyn_cast<DeclRefExpr>(E);
if (DR && DR->getQualifier()) {
// If this is a qualified name, expand the template arguments in nested
// qualifiers.
DR->getQualifier()->print(OS, Policy, true);
// Then print the decl itself.
const ValueDecl *VD = DR->getDecl();
OS << VD->getName();
if (const auto *IV = dyn_cast<VarTemplateSpecializationDecl>(VD)) {
// This is a template variable, print the expanded template arguments.
printTemplateArgumentList(OS, IV->getTemplateArgs().asArray(), Policy);
}
return true;
}
return false;
}
private:
const PrintingPolicy Policy;
};
} // end anonymous namespace
std::pair<Expr *, std::string>
Sema::findFailedBooleanCondition(Expr *Cond) {
Cond = lookThroughRangesV3Condition(PP, Cond);
// Separate out all of the terms in a conjunction.
SmallVector<Expr *, 4> Terms;
collectConjunctionTerms(Cond, Terms);
// Determine which term failed.
Expr *FailedCond = nullptr;
for (Expr *Term : Terms) {
Expr *TermAsWritten = Term->IgnoreParenImpCasts();
// Literals are uninteresting.
if (isa<CXXBoolLiteralExpr>(TermAsWritten) ||
isa<IntegerLiteral>(TermAsWritten))
continue;
// The initialization of the parameter from the argument is
// a constant-evaluated context.
EnterExpressionEvaluationContext ConstantEvaluated(
*this, Sema::ExpressionEvaluationContext::ConstantEvaluated);
bool Succeeded;
if (Term->EvaluateAsBooleanCondition(Succeeded, Context) &&
!Succeeded) {
FailedCond = TermAsWritten;
break;
}
}
if (!FailedCond)
FailedCond = Cond->IgnoreParenImpCasts();
std::string Description;
{
llvm::raw_string_ostream Out(Description);
PrintingPolicy Policy = getPrintingPolicy();
Policy.PrintCanonicalTypes = true;
FailedBooleanConditionPrinterHelper Helper(Policy);
FailedCond->printPretty(Out, &Helper, Policy, 0, "\n", nullptr);
}
return { FailedCond, Description };
}
QualType Sema::CheckTemplateIdType(TemplateName Name,
SourceLocation TemplateLoc,
TemplateArgumentListInfo &TemplateArgs) {
DependentTemplateName *DTN
= Name.getUnderlying().getAsDependentTemplateName();
if (DTN && DTN->isIdentifier())
// When building a template-id where the template-name is dependent,
// assume the template is a type template. Either our assumption is
// correct, or the code is ill-formed and will be diagnosed when the
// dependent name is substituted.
return Context.getDependentTemplateSpecializationType(ETK_None,
DTN->getQualifier(),
DTN->getIdentifier(),
TemplateArgs);
TemplateDecl *Template = Name.getAsTemplateDecl();
if (!Template || isa<FunctionTemplateDecl>(Template) ||
isa<VarTemplateDecl>(Template) ||
isa<ConceptDecl>(Template)) {
// We might have a substituted template template parameter pack. If so,
// build a template specialization type for it.
if (Name.getAsSubstTemplateTemplateParmPack())
return Context.getTemplateSpecializationType(Name, TemplateArgs);
Diag(TemplateLoc, diag::err_template_id_not_a_type)
<< Name;
NoteAllFoundTemplates(Name);
return QualType();
}
// Check that the template argument list is well-formed for this
// template.
SmallVector<TemplateArgument, 4> Converted;
if (CheckTemplateArgumentList(Template, TemplateLoc, TemplateArgs,
false, Converted))
return QualType();
QualType CanonType;
bool InstantiationDependent = false;
if (TypeAliasTemplateDecl *AliasTemplate =
dyn_cast<TypeAliasTemplateDecl>(Template)) {
// Find the canonical type for this type alias template specialization.
TypeAliasDecl *Pattern = AliasTemplate->getTemplatedDecl();
if (Pattern->isInvalidDecl())
return QualType();
TemplateArgumentList StackTemplateArgs(TemplateArgumentList::OnStack,
Converted);
// Only substitute for the innermost template argument list.
MultiLevelTemplateArgumentList TemplateArgLists;
TemplateArgLists.addOuterTemplateArguments(&StackTemplateArgs);
unsigned Depth = AliasTemplate->getTemplateParameters()->getDepth();
for (unsigned I = 0; I < Depth; ++I)
TemplateArgLists.addOuterTemplateArguments(None);
LocalInstantiationScope Scope(*this);
InstantiatingTemplate Inst(*this, TemplateLoc, Template);
if (Inst.isInvalid())
return QualType();
CanonType = SubstType(Pattern->getUnderlyingType(),
TemplateArgLists, AliasTemplate->getLocation(),
AliasTemplate->getDeclName());
if (CanonType.isNull()) {
// If this was enable_if and we failed to find the nested type
// within enable_if in a SFINAE context, dig out the specific
// enable_if condition that failed and present that instead.
if (isEnableIfAliasTemplate(AliasTemplate)) {
if (auto DeductionInfo = isSFINAEContext()) {
if (*DeductionInfo &&
(*DeductionInfo)->hasSFINAEDiagnostic() &&
(*DeductionInfo)->peekSFINAEDiagnostic().second.getDiagID() ==
diag::err_typename_nested_not_found_enable_if &&
TemplateArgs[0].getArgument().getKind()
== TemplateArgument::Expression) {
Expr *FailedCond;
std::string FailedDescription;
std::tie(FailedCond, FailedDescription) =
findFailedBooleanCondition(TemplateArgs[0].getSourceExpression());
// Remove the old SFINAE diagnostic.
PartialDiagnosticAt OldDiag =
{SourceLocation(), PartialDiagnostic::NullDiagnostic()};
(*DeductionInfo)->takeSFINAEDiagnostic(OldDiag);
// Add a new SFINAE diagnostic specifying which condition
// failed.
(*DeductionInfo)->addSFINAEDiagnostic(
OldDiag.first,
PDiag(diag::err_typename_nested_not_found_requirement)
<< FailedDescription
<< FailedCond->getSourceRange());
}
}
}
return QualType();
}
} else if (Name.isDependent() ||
TemplateSpecializationType::anyDependentTemplateArguments(
TemplateArgs, InstantiationDependent)) {
// This class template specialization is a dependent
// type. Therefore, its canonical type is another class template
// specialization type that contains all of the converted
// arguments in canonical form. This ensures that, e.g., A<T> and
// A<T, T> have identical types when A is declared as:
//
// template<typename T, typename U = T> struct A;
CanonType = Context.getCanonicalTemplateSpecializationType(Name, Converted);
// This might work out to be a current instantiation, in which
// case the canonical type needs to be the InjectedClassNameType.
//
// TODO: in theory this could be a simple hashtable lookup; most
// changes to CurContext don't change the set of current
// instantiations.
if (isa<ClassTemplateDecl>(Template)) {
for (DeclContext *Ctx = CurContext; Ctx; Ctx = Ctx->getLookupParent()) {
// If we get out to a namespace, we're done.
if (Ctx->isFileContext()) break;
// If this isn't a record, keep looking.
CXXRecordDecl *Record = dyn_cast<CXXRecordDecl>(Ctx);
if (!Record) continue;
// Look for one of the two cases with InjectedClassNameTypes
// and check whether it's the same template.
if (!isa<ClassTemplatePartialSpecializationDecl>(Record) &&
!Record->getDescribedClassTemplate())
continue;
// Fetch the injected class name type and check whether its
// injected type is equal to the type we just built.
QualType ICNT = Context.getTypeDeclType(Record);
QualType Injected = cast<InjectedClassNameType>(ICNT)
->getInjectedSpecializationType();
if (CanonType != Injected->getCanonicalTypeInternal())
continue;
// If so, the canonical type of this TST is the injected
// class name type of the record we just found.
assert(ICNT.isCanonical());
CanonType = ICNT;
break;
}
}
} else if (ClassTemplateDecl *ClassTemplate
= dyn_cast<ClassTemplateDecl>(Template)) {
// Find the class template specialization declaration that
// corresponds to these arguments.
void *InsertPos = nullptr;
ClassTemplateSpecializationDecl *Decl
= ClassTemplate->findSpecialization(Converted, InsertPos);
if (!Decl) {
// This is the first time we have referenced this class template
// specialization. Create the canonical declaration and add it to
// the set of specializations.
Decl = ClassTemplateSpecializationDecl::Create(
Context, ClassTemplate->getTemplatedDecl()->getTagKind(),
ClassTemplate->getDeclContext(),
ClassTemplate->getTemplatedDecl()->getBeginLoc(),
ClassTemplate->getLocation(), ClassTemplate, Converted, nullptr);
ClassTemplate->AddSpecialization(Decl, InsertPos);
if (ClassTemplate->isOutOfLine())
Decl->setLexicalDeclContext(ClassTemplate->getLexicalDeclContext());
}
if (Decl->getSpecializationKind() == TSK_Undeclared) {
MultiLevelTemplateArgumentList TemplateArgLists;
TemplateArgLists.addOuterTemplateArguments(Converted);
InstantiateAttrsForDecl(TemplateArgLists, ClassTemplate->getTemplatedDecl(),
Decl);
}
// Diagnose uses of this specialization.
(void)DiagnoseUseOfDecl(Decl, TemplateLoc);
CanonType = Context.getTypeDeclType(Decl);
assert(isa<RecordType>(CanonType) &&
"type of non-dependent specialization is not a RecordType");
} else if (auto *BTD = dyn_cast<BuiltinTemplateDecl>(Template)) {
CanonType = checkBuiltinTemplateIdType(*this, BTD, Converted, TemplateLoc,
TemplateArgs);
}
// Build the fully-sugared type for this class template
// specialization, which refers back to the class template
// specialization we created or found.
return Context.getTemplateSpecializationType(Name, TemplateArgs, CanonType);
}
void Sema::ActOnUndeclaredTypeTemplateName(Scope *S, TemplateTy &ParsedName,
TemplateNameKind &TNK,
SourceLocation NameLoc,
IdentifierInfo *&II) {
assert(TNK == TNK_Undeclared_template && "not an undeclared template name");
TemplateName Name = ParsedName.get();
auto *ATN = Name.getAsAssumedTemplateName();
assert(ATN && "not an assumed template name");
II = ATN->getDeclName().getAsIdentifierInfo();
if (!resolveAssumedTemplateNameAsType(S, Name, NameLoc, /*Diagnose*/false)) {
// Resolved to a type template name.
ParsedName = TemplateTy::make(Name);
TNK = TNK_Type_template;
}
}
bool Sema::resolveAssumedTemplateNameAsType(Scope *S, TemplateName &Name,
SourceLocation NameLoc,
bool Diagnose) {
// We assumed this undeclared identifier to be an (ADL-only) function
// template name, but it was used in a context where a type was required.
// Try to typo-correct it now.
AssumedTemplateStorage *ATN = Name.getAsAssumedTemplateName();
assert(ATN && "not an assumed template name");
LookupResult R(*this, ATN->getDeclName(), NameLoc, LookupOrdinaryName);
struct CandidateCallback : CorrectionCandidateCallback {
bool ValidateCandidate(const TypoCorrection &TC) override {
return TC.getCorrectionDecl() &&
getAsTypeTemplateDecl(TC.getCorrectionDecl());
}
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return llvm::make_unique<CandidateCallback>(*this);
}
} FilterCCC;
TypoCorrection Corrected =
CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), S, nullptr,
FilterCCC, CTK_ErrorRecovery);
if (Corrected && Corrected.getFoundDecl()) {
diagnoseTypo(Corrected, PDiag(diag::err_no_template_suggest)
<< ATN->getDeclName());
Name = TemplateName(Corrected.getCorrectionDeclAs<TemplateDecl>());
return false;
}
if (Diagnose)
Diag(R.getNameLoc(), diag::err_no_template) << R.getLookupName();
return true;
}
TypeResult Sema::ActOnTemplateIdType(
Scope *S, CXXScopeSpec &SS, SourceLocation TemplateKWLoc,
TemplateTy TemplateD, IdentifierInfo *TemplateII,
SourceLocation TemplateIILoc, SourceLocation LAngleLoc,
ASTTemplateArgsPtr TemplateArgsIn, SourceLocation RAngleLoc,
bool IsCtorOrDtorName, bool IsClassName) {
if (SS.isInvalid())
return true;
if (!IsCtorOrDtorName && !IsClassName && SS.isSet()) {
DeclContext *LookupCtx = computeDeclContext(SS, /*EnteringContext*/false);
// C++ [temp.res]p3:
// A qualified-id that refers to a type and in which the
// nested-name-specifier depends on a template-parameter (14.6.2)
// shall be prefixed by the keyword typename to indicate that the
// qualified-id denotes a type, forming an
// elaborated-type-specifier (7.1.5.3).
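//
// For example (illustrative only): inside a template, 'T::template X<int> v;'
// must instead be written as 'typename T::template X<int> v;'.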
if (!LookupCtx && isDependentScopeSpecifier(SS)) {
Diag(SS.getBeginLoc(), diag::err_typename_missing_template)
<< SS.getScopeRep() << TemplateII->getName();
// Recover as if 'typename' were specified.
// FIXME: This is not quite correct recovery as we don't transform SS
// into the corresponding dependent form (and we don't diagnose missing
// 'template' keywords within SS as a result).
return ActOnTypenameType(nullptr, SourceLocation(), SS, TemplateKWLoc,
TemplateD, TemplateII, TemplateIILoc, LAngleLoc,
TemplateArgsIn, RAngleLoc);
}
// Per C++ [class.qual]p2, if the template-id was an injected-class-name,
// it's not actually allowed to be used as a type in most cases. Because
// we annotate it before we know whether it's valid, we have to check for
// this case here.
auto *LookupRD = dyn_cast_or_null<CXXRecordDecl>(LookupCtx);
if (LookupRD && LookupRD->getIdentifier() == TemplateII) {
Diag(TemplateIILoc,
TemplateKWLoc.isInvalid()
? diag::err_out_of_line_qualified_id_type_names_constructor
: diag::ext_out_of_line_qualified_id_type_names_constructor)
<< TemplateII << 0 /*injected-class-name used as template name*/
<< 1 /*if any keyword was present, it was 'template'*/;
}
}
TemplateName Template = TemplateD.get();
if (Template.getAsAssumedTemplateName() &&
resolveAssumedTemplateNameAsType(S, Template, TemplateIILoc))
return true;
// Translate the parser's template argument list in our AST format.
TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc);
translateTemplateArguments(TemplateArgsIn, TemplateArgs);
if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) {
QualType T
= Context.getDependentTemplateSpecializationType(ETK_None,
DTN->getQualifier(),
DTN->getIdentifier(),
TemplateArgs);
// Build type-source information.
TypeLocBuilder TLB;
DependentTemplateSpecializationTypeLoc SpecTL
= TLB.push<DependentTemplateSpecializationTypeLoc>(T);
SpecTL.setElaboratedKeywordLoc(SourceLocation());
SpecTL.setQualifierLoc(SS.getWithLocInContext(Context));
SpecTL.setTemplateKeywordLoc(TemplateKWLoc);
SpecTL.setTemplateNameLoc(TemplateIILoc);
SpecTL.setLAngleLoc(LAngleLoc);
SpecTL.setRAngleLoc(RAngleLoc);
for (unsigned I = 0, N = SpecTL.getNumArgs(); I != N; ++I)
SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo());
return CreateParsedType(T, TLB.getTypeSourceInfo(Context, T));
}
QualType Result = CheckTemplateIdType(Template, TemplateIILoc, TemplateArgs);
if (Result.isNull())
return true;
// Build type-source information.
TypeLocBuilder TLB;
TemplateSpecializationTypeLoc SpecTL
= TLB.push<TemplateSpecializationTypeLoc>(Result);
SpecTL.setTemplateKeywordLoc(TemplateKWLoc);
SpecTL.setTemplateNameLoc(TemplateIILoc);
SpecTL.setLAngleLoc(LAngleLoc);
SpecTL.setRAngleLoc(RAngleLoc);
for (unsigned i = 0, e = SpecTL.getNumArgs(); i != e; ++i)
SpecTL.setArgLocInfo(i, TemplateArgs[i].getLocInfo());
// NOTE: avoid constructing an ElaboratedTypeLoc if this is a
// constructor or destructor name (in such a case, the scope specifier
// will be attached to the enclosing Decl or Expr node).
if (SS.isNotEmpty() && !IsCtorOrDtorName) {
// Create an elaborated-type-specifier containing the nested-name-specifier.
Result = Context.getElaboratedType(ETK_None, SS.getScopeRep(), Result);
ElaboratedTypeLoc ElabTL = TLB.push<ElaboratedTypeLoc>(Result);
ElabTL.setElaboratedKeywordLoc(SourceLocation());
ElabTL.setQualifierLoc(SS.getWithLocInContext(Context));
}
return CreateParsedType(Result, TLB.getTypeSourceInfo(Context, Result));
}
TypeResult Sema::ActOnTagTemplateIdType(TagUseKind TUK,
TypeSpecifierType TagSpec,
SourceLocation TagLoc,
CXXScopeSpec &SS,
SourceLocation TemplateKWLoc,
TemplateTy TemplateD,
SourceLocation TemplateLoc,
SourceLocation LAngleLoc,
ASTTemplateArgsPtr TemplateArgsIn,
SourceLocation RAngleLoc) {
TemplateName Template = TemplateD.get();
// Translate the parser's template argument list in our AST format.
TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc);
translateTemplateArguments(TemplateArgsIn, TemplateArgs);
// Determine the tag kind
TagTypeKind TagKind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
ElaboratedTypeKeyword Keyword
= TypeWithKeyword::getKeywordForTagTypeKind(TagKind);
if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) {
QualType T = Context.getDependentTemplateSpecializationType(Keyword,
DTN->getQualifier(),
DTN->getIdentifier(),
TemplateArgs);
// Build type-source information.
TypeLocBuilder TLB;
DependentTemplateSpecializationTypeLoc SpecTL
= TLB.push<DependentTemplateSpecializationTypeLoc>(T);
SpecTL.setElaboratedKeywordLoc(TagLoc);
SpecTL.setQualifierLoc(SS.getWithLocInContext(Context));
SpecTL.setTemplateKeywordLoc(TemplateKWLoc);
SpecTL.setTemplateNameLoc(TemplateLoc);
SpecTL.setLAngleLoc(LAngleLoc);
SpecTL.setRAngleLoc(RAngleLoc);
for (unsigned I = 0, N = SpecTL.getNumArgs(); I != N; ++I)
SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo());
return CreateParsedType(T, TLB.getTypeSourceInfo(Context, T));
}
if (TypeAliasTemplateDecl *TAT =
dyn_cast_or_null<TypeAliasTemplateDecl>(Template.getAsTemplateDecl())) {
// C++0x [dcl.type.elab]p2:
// If the identifier resolves to a typedef-name or the simple-template-id
// resolves to an alias template specialization, the
// elaborated-type-specifier is ill-formed.
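//
// For example (illustrative sketch only):
//   template<typename T> struct S {};
//   template<typename T> using A = S<T>;
//   struct A<int> a;   // ill-formed: the elaborated-type-specifier names an
//                      // alias template specialization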
Diag(TemplateLoc, diag::err_tag_reference_non_tag)
<< TAT << NTK_TypeAliasTemplate << TagKind;
Diag(TAT->getLocation(), diag::note_declared_at);
}
QualType Result = CheckTemplateIdType(Template, TemplateLoc, TemplateArgs);
if (Result.isNull())
return TypeResult(true);
// Check the tag kind
if (const RecordType *RT = Result->getAs<RecordType>()) {
RecordDecl *D = RT->getDecl();
IdentifierInfo *Id = D->getIdentifier();
assert(Id && "templated class must have an identifier");
if (!isAcceptableTagRedeclaration(D, TagKind, TUK == TUK_Definition,
TagLoc, Id)) {
Diag(TagLoc, diag::err_use_with_wrong_tag)
<< Result
<< FixItHint::CreateReplacement(SourceRange(TagLoc), D->getKindName());
Diag(D->getLocation(), diag::note_previous_use);
}
}
// Provide source-location information for the template specialization.
TypeLocBuilder TLB;
TemplateSpecializationTypeLoc SpecTL
= TLB.push<TemplateSpecializationTypeLoc>(Result);
SpecTL.setTemplateKeywordLoc(TemplateKWLoc);
SpecTL.setTemplateNameLoc(TemplateLoc);
SpecTL.setLAngleLoc(LAngleLoc);
SpecTL.setRAngleLoc(RAngleLoc);
for (unsigned i = 0, e = SpecTL.getNumArgs(); i != e; ++i)
SpecTL.setArgLocInfo(i, TemplateArgs[i].getLocInfo());
// Construct an elaborated type containing the nested-name-specifier (if any)
// and tag keyword.
Result = Context.getElaboratedType(Keyword, SS.getScopeRep(), Result);
ElaboratedTypeLoc ElabTL = TLB.push<ElaboratedTypeLoc>(Result);
ElabTL.setElaboratedKeywordLoc(TagLoc);
ElabTL.setQualifierLoc(SS.getWithLocInContext(Context));
return CreateParsedType(Result, TLB.getTypeSourceInfo(Context, Result));
}
static bool CheckTemplateSpecializationScope(Sema &S, NamedDecl *Specialized,
NamedDecl *PrevDecl,
SourceLocation Loc,
bool IsPartialSpecialization);
static TemplateSpecializationKind getTemplateSpecializationKind(Decl *D);
static bool isTemplateArgumentTemplateParameter(
const TemplateArgument &Arg, unsigned Depth, unsigned Index) {
switch (Arg.getKind()) {
case TemplateArgument::Null:
case TemplateArgument::NullPtr:
case TemplateArgument::Integral:
case TemplateArgument::Declaration:
case TemplateArgument::Pack:
case TemplateArgument::TemplateExpansion:
return false;
case TemplateArgument::Type: {
QualType Type = Arg.getAsType();
const TemplateTypeParmType *TPT =
Arg.getAsType()->getAs<TemplateTypeParmType>();
return TPT && !Type.hasQualifiers() &&
TPT->getDepth() == Depth && TPT->getIndex() == Index;
}
case TemplateArgument::Expression: {
DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(Arg.getAsExpr());
if (!DRE || !DRE->getDecl())
return false;
const NonTypeTemplateParmDecl *NTTP =
dyn_cast<NonTypeTemplateParmDecl>(DRE->getDecl());
return NTTP && NTTP->getDepth() == Depth && NTTP->getIndex() == Index;
}
case TemplateArgument::Template:
const TemplateTemplateParmDecl *TTP =
dyn_cast_or_null<TemplateTemplateParmDecl>(
Arg.getAsTemplateOrTemplatePattern().getAsTemplateDecl());
return TTP && TTP->getDepth() == Depth && TTP->getIndex() == Index;
}
llvm_unreachable("unexpected kind of template argument");
}
static bool isSameAsPrimaryTemplate(TemplateParameterList *Params,
ArrayRef<TemplateArgument> Args) {
if (Params->size() != Args.size())
return false;
unsigned Depth = Params->getDepth();
for (unsigned I = 0, N = Args.size(); I != N; ++I) {
TemplateArgument Arg = Args[I];
// If the parameter is a pack expansion, the argument must be a pack
// whose only element is a pack expansion.
if (Params->getParam(I)->isParameterPack()) {
if (Arg.getKind() != TemplateArgument::Pack || Arg.pack_size() != 1 ||
!Arg.pack_begin()->isPackExpansion())
return false;
Arg = Arg.pack_begin()->getPackExpansionPattern();
}
if (!isTemplateArgumentTemplateParameter(Arg, Depth, I))
return false;
}
return true;
}
/// Convert the parser's template argument list representation into our form.
static TemplateArgumentListInfo
makeTemplateArgumentListInfo(Sema &S, TemplateIdAnnotation &TemplateId) {
TemplateArgumentListInfo TemplateArgs(TemplateId.LAngleLoc,
TemplateId.RAngleLoc);
ASTTemplateArgsPtr TemplateArgsPtr(TemplateId.getTemplateArgs(),
TemplateId.NumArgs);
S.translateTemplateArguments(TemplateArgsPtr, TemplateArgs);
return TemplateArgs;
}
template<typename PartialSpecDecl>
static void checkMoreSpecializedThanPrimary(Sema &S, PartialSpecDecl *Partial) {
if (Partial->getDeclContext()->isDependentContext())
return;
// FIXME: Get the TDK from deduction in order to provide better diagnostics
// for non-substitution-failure issues?
TemplateDeductionInfo Info(Partial->getLocation());
if (S.isMoreSpecializedThanPrimary(Partial, Info))
return;
auto *Template = Partial->getSpecializedTemplate();
S.Diag(Partial->getLocation(),
diag::ext_partial_spec_not_more_specialized_than_primary)
<< isa<VarTemplateDecl>(Template);
if (Info.hasSFINAEDiagnostic()) {
PartialDiagnosticAt Diag = {SourceLocation(),
PartialDiagnostic::NullDiagnostic()};
Info.takeSFINAEDiagnostic(Diag);
SmallString<128> SFINAEArgString;
Diag.second.EmitToString(S.getDiagnostics(), SFINAEArgString);
S.Diag(Diag.first,
diag::note_partial_spec_not_more_specialized_than_primary)
<< SFINAEArgString;
}
S.Diag(Template->getLocation(), diag::note_template_decl_here);
}
static void
noteNonDeducibleParameters(Sema &S, TemplateParameterList *TemplateParams,
const llvm::SmallBitVector &DeducibleParams) {
for (unsigned I = 0, N = DeducibleParams.size(); I != N; ++I) {
if (!DeducibleParams[I]) {
NamedDecl *Param = TemplateParams->getParam(I);
if (Param->getDeclName())
S.Diag(Param->getLocation(), diag::note_non_deducible_parameter)
<< Param->getDeclName();
else
S.Diag(Param->getLocation(), diag::note_non_deducible_parameter)
<< "(anonymous)";
}
}
}
template<typename PartialSpecDecl>
static void checkTemplatePartialSpecialization(Sema &S,
PartialSpecDecl *Partial) {
// C++1z [temp.class.spec]p8: (DR1495)
// - The specialization shall be more specialized than the primary
// template (14.5.5.2).
checkMoreSpecializedThanPrimary(S, Partial);
// C++ [temp.class.spec]p8: (DR1315)
// - Each template-parameter shall appear at least once in the
// template-id outside a non-deduced context.
// C++1z [temp.class.spec.match]p3 (P0127R2)
// If the template arguments of a partial specialization cannot be
// deduced because of the structure of its template-parameter-list
// and the template-id, the program is ill-formed.
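//
// For example (illustrative sketch only):
//   template<typename T> struct Map {};
//   template<typename T> struct Map<typename T::key_type> {};
// Here T appears only in a non-deduced context, so its argument cannot be
// deduced from the template-id.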
auto *TemplateParams = Partial->getTemplateParameters();
llvm::SmallBitVector DeducibleParams(TemplateParams->size());
S.MarkUsedTemplateParameters(Partial->getTemplateArgs(), true,
TemplateParams->getDepth(), DeducibleParams);
if (!DeducibleParams.all()) {
unsigned NumNonDeducible = DeducibleParams.size() - DeducibleParams.count();
S.Diag(Partial->getLocation(), diag::ext_partial_specs_not_deducible)
<< isa<VarTemplatePartialSpecializationDecl>(Partial)
<< (NumNonDeducible > 1)
<< SourceRange(Partial->getLocation(),
Partial->getTemplateArgsAsWritten()->RAngleLoc);
noteNonDeducibleParameters(S, TemplateParams, DeducibleParams);
}
}
void Sema::CheckTemplatePartialSpecialization(
ClassTemplatePartialSpecializationDecl *Partial) {
checkTemplatePartialSpecialization(*this, Partial);
}
void Sema::CheckTemplatePartialSpecialization(
VarTemplatePartialSpecializationDecl *Partial) {
checkTemplatePartialSpecialization(*this, Partial);
}
void Sema::CheckDeductionGuideTemplate(FunctionTemplateDecl *TD) {
// C++1z [temp.param]p11:
// A template parameter of a deduction guide template that does not have a
// default-argument shall be deducible from the parameter-type-list of the
// deduction guide template.
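//
// For example (illustrative sketch only):
//   template<typename T> struct X { X(int); };
//   template<typename T> X(int) -> X<T>;   // T is not deducible from '(int)'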
auto *TemplateParams = TD->getTemplateParameters();
llvm::SmallBitVector DeducibleParams(TemplateParams->size());
MarkDeducedTemplateParameters(TD, DeducibleParams);
for (unsigned I = 0; I != TemplateParams->size(); ++I) {
// A parameter pack is deducible (to an empty pack).
auto *Param = TemplateParams->getParam(I);
if (Param->isParameterPack() || hasVisibleDefaultArgument(Param))
DeducibleParams[I] = true;
}
if (!DeducibleParams.all()) {
unsigned NumNonDeducible = DeducibleParams.size() - DeducibleParams.count();
Diag(TD->getLocation(), diag::err_deduction_guide_template_not_deducible)
<< (NumNonDeducible > 1);
noteNonDeducibleParameters(*this, TemplateParams, DeducibleParams);
}
}
DeclResult Sema::ActOnVarTemplateSpecialization(
Scope *S, Declarator &D, TypeSourceInfo *DI, SourceLocation TemplateKWLoc,
TemplateParameterList *TemplateParams, StorageClass SC,
bool IsPartialSpecialization) {
// D must be a variable template id.
assert(D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId &&
"Variable template specialization is declared with a template id.");
TemplateIdAnnotation *TemplateId = D.getName().TemplateId;
TemplateArgumentListInfo TemplateArgs =
makeTemplateArgumentListInfo(*this, *TemplateId);
SourceLocation TemplateNameLoc = D.getIdentifierLoc();
SourceLocation LAngleLoc = TemplateId->LAngleLoc;
SourceLocation RAngleLoc = TemplateId->RAngleLoc;
TemplateName Name = TemplateId->Template.get();
// The template-id must name a variable template.
VarTemplateDecl *VarTemplate =
dyn_cast_or_null<VarTemplateDecl>(Name.getAsTemplateDecl());
if (!VarTemplate) {
NamedDecl *FnTemplate;
if (auto *OTS = Name.getAsOverloadedTemplate())
FnTemplate = *OTS->begin();
else
FnTemplate = dyn_cast_or_null<FunctionTemplateDecl>(Name.getAsTemplateDecl());
if (FnTemplate)
return Diag(D.getIdentifierLoc(), diag::err_var_spec_no_template_but_method)
<< FnTemplate->getDeclName();
return Diag(D.getIdentifierLoc(), diag::err_var_spec_no_template)
<< IsPartialSpecialization;
}
// Check for unexpanded parameter packs in any of the template arguments.
for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I)
if (DiagnoseUnexpandedParameterPack(TemplateArgs[I],
UPPC_PartialSpecialization))
return true;
// Check that the template argument list is well-formed for this
// template.
SmallVector<TemplateArgument, 4> Converted;
if (CheckTemplateArgumentList(VarTemplate, TemplateNameLoc, TemplateArgs,
false, Converted))
return true;
// Find the variable template (partial) specialization declaration that
// corresponds to these arguments.
if (IsPartialSpecialization) {
if (CheckTemplatePartialSpecializationArgs(TemplateNameLoc, VarTemplate,
TemplateArgs.size(), Converted))
return true;
// FIXME: Move these checks to CheckTemplatePartialSpecializationArgs so we
// also do them during instantiation.
bool InstantiationDependent;
if (!Name.isDependent() &&
!TemplateSpecializationType::anyDependentTemplateArguments(
TemplateArgs.arguments(),
InstantiationDependent)) {
Diag(TemplateNameLoc, diag::err_partial_spec_fully_specialized)
<< VarTemplate->getDeclName();
IsPartialSpecialization = false;
}
if (isSameAsPrimaryTemplate(VarTemplate->getTemplateParameters(),
Converted)) {
// C++ [temp.class.spec]p9b3:
//
// -- The argument list of the specialization shall not be identical
// to the implicit argument list of the primary template.
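//
// For example (illustrative sketch only):
//   template<typename T> T value = T();
//   template<typename T> T value<T> = T();   // same argument list as the primary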
Diag(TemplateNameLoc, diag::err_partial_spec_args_match_primary_template)
<< /*variable template*/ 1
<< /*is definition*/(SC != SC_Extern && !CurContext->isRecord())
<< FixItHint::CreateRemoval(SourceRange(LAngleLoc, RAngleLoc));
// FIXME: Recover from this by treating the declaration as a redeclaration
// of the primary template.
return true;
}
}
void *InsertPos = nullptr;
VarTemplateSpecializationDecl *PrevDecl = nullptr;
if (IsPartialSpecialization)
// FIXME: Template parameter list matters too
PrevDecl = VarTemplate->findPartialSpecialization(Converted, InsertPos);
else
PrevDecl = VarTemplate->findSpecialization(Converted, InsertPos);
VarTemplateSpecializationDecl *Specialization = nullptr;
// Check whether we can declare a variable template specialization in
// the current scope.
if (CheckTemplateSpecializationScope(*this, VarTemplate, PrevDecl,
TemplateNameLoc,
IsPartialSpecialization))
return true;
if (PrevDecl && PrevDecl->getSpecializationKind() == TSK_Undeclared) {
// Since the only prior variable template specialization with these
// arguments was referenced but not declared, reuse that
// declaration node as our own, updating its source location and
// the list of outer template parameters to reflect our new declaration.
Specialization = PrevDecl;
Specialization->setLocation(TemplateNameLoc);
PrevDecl = nullptr;
} else if (IsPartialSpecialization) {
// Create a new class template partial specialization declaration node.
VarTemplatePartialSpecializationDecl *PrevPartial =
cast_or_null<VarTemplatePartialSpecializationDecl>(PrevDecl);
VarTemplatePartialSpecializationDecl *Partial =
VarTemplatePartialSpecializationDecl::Create(
Context, VarTemplate->getDeclContext(), TemplateKWLoc,
TemplateNameLoc, TemplateParams, VarTemplate, DI->getType(), DI, SC,
Converted, TemplateArgs);
if (!PrevPartial)
VarTemplate->AddPartialSpecialization(Partial, InsertPos);
Specialization = Partial;
// If we are providing an explicit specialization of a member variable
// template specialization, make a note of that.
if (PrevPartial && PrevPartial->getInstantiatedFromMember())
PrevPartial->setMemberSpecialization();
CheckTemplatePartialSpecialization(Partial);
} else {
// Create a new class template specialization declaration node for
// this explicit specialization or friend declaration.
Specialization = VarTemplateSpecializationDecl::Create(
Context, VarTemplate->getDeclContext(), TemplateKWLoc, TemplateNameLoc,
VarTemplate, DI->getType(), DI, SC, Converted);
Specialization->setTemplateArgsInfo(TemplateArgs);
if (!PrevDecl)
VarTemplate->AddSpecialization(Specialization, InsertPos);
}
// C++ [temp.expl.spec]p6:
// If a template, a member template or the member of a class template is
// explicitly specialized then that specialization shall be declared
// before the first use of that specialization that would cause an implicit
// instantiation to take place, in every translation unit in which such a
// use occurs; no diagnostic is required.
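//
// For example (illustrative sketch only):
//   template<typename T> T v = T();
//   int x = v<int>;                // point of instantiation of v<int>
//   template<> int v<int> = 42;    // ill-formed: specialization after instantiation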
if (PrevDecl && PrevDecl->getPointOfInstantiation().isValid()) {
bool Okay = false;
for (Decl *Prev = PrevDecl; Prev; Prev = Prev->getPreviousDecl()) {
// Is there any previous explicit specialization declaration?
if (getTemplateSpecializationKind(Prev) == TSK_ExplicitSpecialization) {
Okay = true;
break;
}
}
if (!Okay) {
SourceRange Range(TemplateNameLoc, RAngleLoc);
Diag(TemplateNameLoc, diag::err_specialization_after_instantiation)
<< Name << Range;
Diag(PrevDecl->getPointOfInstantiation(),
diag::note_instantiation_required_here)
<< (PrevDecl->getTemplateSpecializationKind() !=
TSK_ImplicitInstantiation);
return true;
}
}
Specialization->setTemplateKeywordLoc(TemplateKWLoc);
Specialization->setLexicalDeclContext(CurContext);
// Add the specialization into its lexical context, so that it can
// be seen when iterating through the list of declarations in that
// context. However, specializations are not found by name lookup.
CurContext->addDecl(Specialization);
// Note that this is an explicit specialization.
Specialization->setSpecializationKind(TSK_ExplicitSpecialization);
if (PrevDecl) {
// Check that this isn't a redefinition of this specialization,
// merging with previous declarations.
LookupResult PrevSpec(*this, GetNameForDeclarator(D), LookupOrdinaryName,
forRedeclarationInCurContext());
PrevSpec.addDecl(PrevDecl);
D.setRedeclaration(CheckVariableDeclaration(Specialization, PrevSpec));
} else if (Specialization->isStaticDataMember() &&
Specialization->isOutOfLine()) {
Specialization->setAccess(VarTemplate->getAccess());
}
return Specialization;
}
namespace {
/// A partial specialization whose template arguments have matched
/// a given template-id.
struct PartialSpecMatchResult {
VarTemplatePartialSpecializationDecl *Partial;
TemplateArgumentList *Args;
};
} // end anonymous namespace
DeclResult
Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc,
SourceLocation TemplateNameLoc,
const TemplateArgumentListInfo &TemplateArgs) {
assert(Template && "A variable template id without template?");
// Check that the template argument list is well-formed for this template.
SmallVector<TemplateArgument, 4> Converted;
if (CheckTemplateArgumentList(
Template, TemplateNameLoc,
const_cast<TemplateArgumentListInfo &>(TemplateArgs), false,
Converted))
return true;
// Find the variable template specialization declaration that
// corresponds to these arguments.
void *InsertPos = nullptr;
if (VarTemplateSpecializationDecl *Spec = Template->findSpecialization(
Converted, InsertPos)) {
checkSpecializationVisibility(TemplateNameLoc, Spec);
// If we already have a variable template specialization, return it.
return Spec;
}
// This is the first time we have referenced this variable template
// specialization. Create the canonical declaration and add it to
// the set of specializations, based on the closest partial specialization
// that it represents.
VarDecl *InstantiationPattern = Template->getTemplatedDecl();
TemplateArgumentList TemplateArgList(TemplateArgumentList::OnStack,
Converted);
TemplateArgumentList *InstantiationArgs = &TemplateArgList;
bool AmbiguousPartialSpec = false;
typedef PartialSpecMatchResult MatchResult;
SmallVector<MatchResult, 4> Matched;
SourceLocation PointOfInstantiation = TemplateNameLoc;
TemplateSpecCandidateSet FailedCandidates(PointOfInstantiation,
/*ForTakingAddress=*/false);
// 1. Attempt to find the closest partial specialization that this
// specializes, if any.
// If any of the template arguments is dependent, then this is probably
// a placeholder for an incomplete declarative context, which must be
// complete by instantiation time. Thus, do not search through the partial
// specializations yet.
// TODO: Unify with InstantiateClassTemplateSpecialization()?
// Perhaps better after unification of DeduceTemplateArguments() and
// getMoreSpecializedPartialSpecialization().
bool InstantiationDependent = false;
if (!TemplateSpecializationType::anyDependentTemplateArguments(
TemplateArgs, InstantiationDependent)) {
SmallVector<VarTemplatePartialSpecializationDecl *, 4> PartialSpecs;
Template->getPartialSpecializations(PartialSpecs);
for (unsigned I = 0, N = PartialSpecs.size(); I != N; ++I) {
VarTemplatePartialSpecializationDecl *Partial = PartialSpecs[I];
TemplateDeductionInfo Info(FailedCandidates.getLocation());
if (TemplateDeductionResult Result =
DeduceTemplateArguments(Partial, TemplateArgList, Info)) {
// Store the failed-deduction information for use in diagnostics, later.
// TODO: Actually use the failed-deduction info?
FailedCandidates.addCandidate().set(
DeclAccessPair::make(Template, AS_public), Partial,
MakeDeductionFailureInfo(Context, Result, Info));
(void)Result;
} else {
Matched.push_back(PartialSpecMatchResult());
Matched.back().Partial = Partial;
Matched.back().Args = Info.take();
}
}
if (Matched.size() >= 1) {
SmallVector<MatchResult, 4>::iterator Best = Matched.begin();
if (Matched.size() == 1) {
// -- If exactly one matching specialization is found, the
// instantiation is generated from that specialization.
// We don't need to do anything for this.
} else {
// -- If more than one matching specialization is found, the
// partial order rules (14.5.4.2) are used to determine
// whether one of the specializations is more specialized
// than the others. If none of the specializations is more
// specialized than all of the other matching
// specializations, then the use of the variable template is
// ambiguous and the program is ill-formed.
for (SmallVector<MatchResult, 4>::iterator P = Best + 1,
PEnd = Matched.end();
P != PEnd; ++P) {
if (getMoreSpecializedPartialSpecialization(P->Partial, Best->Partial,
PointOfInstantiation) ==
P->Partial)
Best = P;
}
// Determine if the best partial specialization is more specialized than
// the others.
for (SmallVector<MatchResult, 4>::iterator P = Matched.begin(),
PEnd = Matched.end();
P != PEnd; ++P) {
if (P != Best && getMoreSpecializedPartialSpecialization(
P->Partial, Best->Partial,
PointOfInstantiation) != Best->Partial) {
AmbiguousPartialSpec = true;
break;
}
}
}
// Instantiate using the best variable template partial specialization.
InstantiationPattern = Best->Partial;
InstantiationArgs = Best->Args;
} else {
// -- If no match is found, the instantiation is generated
// from the primary template.
// InstantiationPattern = Template->getTemplatedDecl();
}
}
// 2. Create the canonical declaration.
// Note that we do not instantiate a definition until we see an odr-use
// in DoMarkVarDeclReferenced().
// FIXME: LateAttrs et al.?
VarTemplateSpecializationDecl *Decl = BuildVarTemplateInstantiation(
Template, InstantiationPattern, *InstantiationArgs, TemplateArgs,
Converted, TemplateNameLoc, InsertPos /*, LateAttrs, StartingScope*/);
if (!Decl)
return true;
if (AmbiguousPartialSpec) {
// Partial ordering did not produce a clear winner. Complain.
Decl->setInvalidDecl();
Diag(PointOfInstantiation, diag::err_partial_spec_ordering_ambiguous)
<< Decl;
// Print the matching partial specializations.
for (MatchResult P : Matched)
Diag(P.Partial->getLocation(), diag::note_partial_spec_match)
<< getTemplateArgumentBindingsText(P.Partial->getTemplateParameters(),
*P.Args);
return true;
}
if (VarTemplatePartialSpecializationDecl *D =
dyn_cast<VarTemplatePartialSpecializationDecl>(InstantiationPattern))
Decl->setInstantiationOf(D, InstantiationArgs);
checkSpecializationVisibility(TemplateNameLoc, Decl);
assert(Decl && "No variable template specialization?");
return Decl;
}
ExprResult
Sema::CheckVarTemplateId(const CXXScopeSpec &SS,
const DeclarationNameInfo &NameInfo,
VarTemplateDecl *Template, SourceLocation TemplateLoc,
const TemplateArgumentListInfo *TemplateArgs) {
DeclResult Decl = CheckVarTemplateId(Template, TemplateLoc, NameInfo.getLoc(),
*TemplateArgs);
if (Decl.isInvalid())
return ExprError();
VarDecl *Var = cast<VarDecl>(Decl.get());
if (!Var->getTemplateSpecializationKind())
Var->setTemplateSpecializationKind(TSK_ImplicitInstantiation,
NameInfo.getLoc());
// Build an ordinary singleton decl ref.
return BuildDeclarationNameExpr(SS, NameInfo, Var,
/*FoundD=*/nullptr, TemplateArgs);
}
void Sema::diagnoseMissingTemplateArguments(TemplateName Name,
SourceLocation Loc) {
Diag(Loc, diag::err_template_missing_args)
<< (int)getTemplateNameKindForDiagnostics(Name) << Name;
if (TemplateDecl *TD = Name.getAsTemplateDecl()) {
Diag(TD->getLocation(), diag::note_template_decl_here)
<< TD->getTemplateParameters()->getSourceRange();
}
}
ExprResult
Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
const DeclarationNameInfo &NameInfo,
ConceptDecl *Template,
SourceLocation TemplateLoc,
const TemplateArgumentListInfo *TemplateArgs) {
// TODO: Do concept specialization here.
Diag(NameInfo.getBeginLoc(), diag::err_concept_not_implemented) <<
"concept specialization";
return ExprError();
}
ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS,
SourceLocation TemplateKWLoc,
LookupResult &R,
bool RequiresADL,
const TemplateArgumentListInfo *TemplateArgs) {
// FIXME: Can we do any checking at this point? I guess we could check the
// template arguments that we have against the template name, if the template
// name refers to a single template. That's not a terribly common case,
// though.
// One might hope that foo<int> could identify a single function unambiguously,
// but that approach does NOT work, since f<int>(1) gets resolved prior to
// resorting to overload resolution; consider
//   template<class T> void f(double);
// vs
//   template<class T, class U> void f(U);
// These should be filtered out by our callers.
assert(!R.isAmbiguous() && "ambiguous lookup when building templateid");
// Non-function templates require a template argument list.
if (auto *TD = R.getAsSingle<TemplateDecl>()) {
if (!TemplateArgs && !isa<FunctionTemplateDecl>(TD)) {
diagnoseMissingTemplateArguments(TemplateName(TD), R.getNameLoc());
return ExprError();
}
}
auto AnyDependentArguments = [&]() -> bool {
bool InstantiationDependent;
return TemplateArgs &&
TemplateSpecializationType::anyDependentTemplateArguments(
*TemplateArgs, InstantiationDependent);
};
// In C++1y, check variable template ids.
if (R.getAsSingle<VarTemplateDecl>() && !AnyDependentArguments()) {
return CheckVarTemplateId(SS, R.getLookupNameInfo(),
R.getAsSingle<VarTemplateDecl>(),
TemplateKWLoc, TemplateArgs);
}
if (R.getAsSingle<ConceptDecl>() && !AnyDependentArguments()) {
return CheckConceptTemplateId(SS, R.getLookupNameInfo(),
R.getAsSingle<ConceptDecl>(),
TemplateKWLoc, TemplateArgs);
}
// We don't want lookup warnings at this point.
R.suppressDiagnostics();
UnresolvedLookupExpr *ULE
= UnresolvedLookupExpr::Create(Context, R.getNamingClass(),
SS.getWithLocInContext(Context),
TemplateKWLoc,
R.getLookupNameInfo(),
RequiresADL, TemplateArgs,
R.begin(), R.end());
return ULE;
}
// We actually only call this from template instantiation.
ExprResult
Sema::BuildQualifiedTemplateIdExpr(CXXScopeSpec &SS,
SourceLocation TemplateKWLoc,
const DeclarationNameInfo &NameInfo,
const TemplateArgumentListInfo *TemplateArgs) {
assert(TemplateArgs || TemplateKWLoc.isValid());
DeclContext *DC;
if (!(DC = computeDeclContext(SS, false)) ||
DC->isDependentContext() ||
RequireCompleteDeclContext(SS, DC))
return BuildDependentDeclRefExpr(SS, TemplateKWLoc, NameInfo, TemplateArgs);
bool MemberOfUnknownSpecialization;
LookupResult R(*this, NameInfo, LookupOrdinaryName);
if (LookupTemplateName(R, (Scope *)nullptr, SS, QualType(),
/*Entering*/false, MemberOfUnknownSpecialization,
TemplateKWLoc))
return ExprError();
if (R.isAmbiguous())
return ExprError();
if (R.empty()) {
Diag(NameInfo.getLoc(), diag::err_no_member)
<< NameInfo.getName() << DC << SS.getRange();
return ExprError();
}
if (ClassTemplateDecl *Temp = R.getAsSingle<ClassTemplateDecl>()) {
Diag(NameInfo.getLoc(), diag::err_template_kw_refers_to_class_template)
<< SS.getScopeRep()
<< NameInfo.getName().getAsString() << SS.getRange();
Diag(Temp->getLocation(), diag::note_referenced_class_template);
return ExprError();
}
return BuildTemplateIdExpr(SS, TemplateKWLoc, R, /*ADL*/ false, TemplateArgs);
}
/// Form a dependent template name.
///
/// This action forms a dependent template name given the template
/// name and its (presumably dependent) scope specifier. For
/// example, given "MetaFun::template apply", the scope specifier \p
/// SS will be "MetaFun::", \p TemplateKWLoc contains the location
/// of the "template" keyword, and "apply" is the \p Name.
TemplateNameKind Sema::ActOnDependentTemplateName(Scope *S,
CXXScopeSpec &SS,
SourceLocation TemplateKWLoc,
const UnqualifiedId &Name,
ParsedType ObjectType,
bool EnteringContext,
TemplateTy &Result,
bool AllowInjectedClassName) {
if (TemplateKWLoc.isValid() && S && !S->getTemplateParamParent())
Diag(TemplateKWLoc,
getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_template_outside_of_template :
diag::ext_template_outside_of_template)
<< FixItHint::CreateRemoval(TemplateKWLoc);
DeclContext *LookupCtx = nullptr;
if (SS.isSet())
LookupCtx = computeDeclContext(SS, EnteringContext);
if (!LookupCtx && ObjectType)
LookupCtx = computeDeclContext(ObjectType.get());
if (LookupCtx) {
// C++0x [temp.names]p5:
// If a name prefixed by the keyword template is not the name of
// a template, the program is ill-formed. [Note: the keyword
// template may not be applied to non-template members of class
// templates. -end note ] [ Note: as is the case with the
// typename prefix, the template prefix is allowed in cases
// where it is not strictly necessary; i.e., when the
// nested-name-specifier or the expression on the left of the ->
// or . is not dependent on a template-parameter, or the use
// does not appear in the scope of a template. -end note]
//
// Note: C++03 was more strict here, because it banned the use of
// the "template" keyword prior to a template-name that was not a
// dependent name. C++ DR468 relaxed this requirement (the
// "template" keyword is now permitted). We follow the C++0x
// rules, even in C++03 mode with a warning, retroactively applying the DR.
bool MemberOfUnknownSpecialization;
TemplateNameKind TNK = isTemplateName(S, SS, TemplateKWLoc.isValid(), Name,
ObjectType, EnteringContext, Result,
MemberOfUnknownSpecialization);
if (TNK == TNK_Non_template && MemberOfUnknownSpecialization) {
// This is a dependent template. Handle it below.
} else if (TNK == TNK_Non_template) {
// Do the lookup again to determine if this is a "nothing found" case or
// a "not a template" case. FIXME: Refactor isTemplateName so we don't
// need to do this.
DeclarationNameInfo DNI = GetNameFromUnqualifiedId(Name);
LookupResult R(*this, DNI.getName(), Name.getBeginLoc(),
LookupOrdinaryName);
bool MOUS;
if (!LookupTemplateName(R, S, SS, ObjectType.get(), EnteringContext,
MOUS, TemplateKWLoc) && !R.isAmbiguous())
Diag(Name.getBeginLoc(), diag::err_no_member)
<< DNI.getName() << LookupCtx << SS.getRange();
return TNK_Non_template;
} else {
// We found something; return it.
auto *LookupRD = dyn_cast<CXXRecordDecl>(LookupCtx);
if (!AllowInjectedClassName && SS.isSet() && LookupRD &&
Name.getKind() == UnqualifiedIdKind::IK_Identifier &&
Name.Identifier && LookupRD->getIdentifier() == Name.Identifier) {
// C++14 [class.qual]p2:
// In a lookup in which function names are not ignored and the
// nested-name-specifier nominates a class C, if the name specified
// [...] is the injected-class-name of C, [...] the name is instead
// considered to name the constructor
//
// We don't get here if naming the constructor would be valid, so we
// just reject immediately and recover by treating the
// injected-class-name as naming the template.
Diag(Name.getBeginLoc(),
diag::ext_out_of_line_qualified_id_type_names_constructor)
<< Name.Identifier
<< 0 /*injected-class-name used as template name*/
<< 1 /*'template' keyword was used*/;
}
return TNK;
}
}
NestedNameSpecifier *Qualifier = SS.getScopeRep();
switch (Name.getKind()) {
case UnqualifiedIdKind::IK_Identifier:
Result = TemplateTy::make(Context.getDependentTemplateName(Qualifier,
Name.Identifier));
return TNK_Dependent_template_name;
case UnqualifiedIdKind::IK_OperatorFunctionId:
Result = TemplateTy::make(Context.getDependentTemplateName(Qualifier,
Name.OperatorFunctionId.Operator));
return TNK_Function_template;
case UnqualifiedIdKind::IK_LiteralOperatorId:
llvm_unreachable("literal operator id cannot have a dependent scope");
default:
break;
}
Diag(Name.getBeginLoc(), diag::err_template_kw_refers_to_non_template)
<< GetNameFromUnqualifiedId(Name).getName() << Name.getSourceRange()
<< TemplateKWLoc;
return TNK_Non_template;
}
bool Sema::CheckTemplateTypeArgument(TemplateTypeParmDecl *Param,
TemplateArgumentLoc &AL,
SmallVectorImpl<TemplateArgument> &Converted) {
const TemplateArgument &Arg = AL.getArgument();
QualType ArgType;
TypeSourceInfo *TSI = nullptr;
// Check template type parameter.
switch(Arg.getKind()) {
case TemplateArgument::Type:
// C++ [temp.arg.type]p1:
// A template-argument for a template-parameter which is a
// type shall be a type-id.
ArgType = Arg.getAsType();
TSI = AL.getTypeSourceInfo();
break;
case TemplateArgument::Template:
case TemplateArgument::TemplateExpansion: {
// We have a template type parameter but the template argument
// is a template without any arguments.
SourceRange SR = AL.getSourceRange();
TemplateName Name = Arg.getAsTemplateOrTemplatePattern();
diagnoseMissingTemplateArguments(Name, SR.getEnd());
return true;
}
case TemplateArgument::Expression: {
// We have a template type parameter but the template argument is an
// expression; see if maybe it is missing the "typename" keyword.
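// For illustration (hypothetical names): in 'X<T::type>' written inside a
// template, the dependent name 'T::type' is parsed as an expression; the code
// below checks whether prepending 'typename' would have produced a type.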
CXXScopeSpec SS;
DeclarationNameInfo NameInfo;
if (DeclRefExpr *ArgExpr = dyn_cast<DeclRefExpr>(Arg.getAsExpr())) {
SS.Adopt(ArgExpr->getQualifierLoc());
NameInfo = ArgExpr->getNameInfo();
} else if (DependentScopeDeclRefExpr *ArgExpr =
dyn_cast<DependentScopeDeclRefExpr>(Arg.getAsExpr())) {
SS.Adopt(ArgExpr->getQualifierLoc());
NameInfo = ArgExpr->getNameInfo();
} else if (CXXDependentScopeMemberExpr *ArgExpr =
dyn_cast<CXXDependentScopeMemberExpr>(Arg.getAsExpr())) {
if (ArgExpr->isImplicitAccess()) {
SS.Adopt(ArgExpr->getQualifierLoc());
NameInfo = ArgExpr->getMemberNameInfo();
}
}
if (auto *II = NameInfo.getName().getAsIdentifierInfo()) {
LookupResult Result(*this, NameInfo, LookupOrdinaryName);
LookupParsedName(Result, CurScope, &SS);
if (Result.getAsSingle<TypeDecl>() ||
Result.getResultKind() ==
LookupResult::NotFoundInCurrentInstantiation) {
// Suggest that the user add 'typename' before the NNS.
SourceLocation Loc = AL.getSourceRange().getBegin();
Diag(Loc, getLangOpts().MSVCCompat
? diag::ext_ms_template_type_arg_missing_typename
: diag::err_template_arg_must_be_type_suggest)
<< FixItHint::CreateInsertion(Loc, "typename ");
Diag(Param->getLocation(), diag::note_template_param_here);
// Recover by synthesizing a type using the location information that we
// already have.
ArgType =
Context.getDependentNameType(ETK_Typename, SS.getScopeRep(), II);
TypeLocBuilder TLB;
DependentNameTypeLoc TL = TLB.push<DependentNameTypeLoc>(ArgType);
TL.setElaboratedKeywordLoc(SourceLocation(/*synthesized*/));
TL.setQualifierLoc(SS.getWithLocInContext(Context));
TL.setNameLoc(NameInfo.getLoc());
TSI = TLB.getTypeSourceInfo(Context, ArgType);
// Overwrite our input TemplateArgumentLoc so that we can recover
// properly.
AL = TemplateArgumentLoc(TemplateArgument(ArgType),
TemplateArgumentLocInfo(TSI));
break;
}
}
// fallthrough
LLVM_FALLTHROUGH;
}
default: {
// We have a template type parameter but the template argument
// is not a type.
SourceRange SR = AL.getSourceRange();
Diag(SR.getBegin(), diag::err_template_arg_must_be_type) << SR;
Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
}
if (CheckTemplateArgument(Param, TSI))
return true;
// Add the converted template type argument.
ArgType = Context.getCanonicalType(ArgType);
// Objective-C ARC:
// If an explicitly-specified template argument type is a lifetime type
// with no lifetime qualifier, the __strong lifetime qualifier is inferred.
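// For illustration (assuming ARC): an explicitly written argument type
// 'NSString *' is treated as 'NSString *__strong'.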
if (getLangOpts().ObjCAutoRefCount &&
ArgType->isObjCLifetimeType() &&
!ArgType.getObjCLifetime()) {
Qualifiers Qs;
Qs.setObjCLifetime(Qualifiers::OCL_Strong);
ArgType = Context.getQualifiedType(ArgType, Qs);
}
Converted.push_back(TemplateArgument(ArgType));
return false;
}
/// Substitute template arguments into the default template argument for
/// the given template type parameter.
///
/// \param SemaRef the semantic analysis object for which we are performing
/// the substitution.
///
/// \param Template the template that we are synthesizing template arguments
/// for.
///
/// \param TemplateLoc the location of the template name that started the
/// template-id we are checking.
///
/// \param RAngleLoc the location of the right angle bracket ('>') that
/// terminates the template-id.
///
/// \param Param the template type parameter whose default we are
/// substituting into.
///
/// \param Converted the list of template arguments provided for template
/// parameters that precede \p Param in the template parameter list.
///
/// \returns the substituted template argument, or NULL if an error occurred.
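///
/// A minimal illustration (hypothetical names):
/// \code
///   template<typename T, typename U = T*> struct X {};
///   X<int> xi; // the default argument 'T*' is substituted to form 'int*'
/// \endcode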
static TypeSourceInfo *
SubstDefaultTemplateArgument(Sema &SemaRef,
TemplateDecl *Template,
SourceLocation TemplateLoc,
SourceLocation RAngleLoc,
TemplateTypeParmDecl *Param,
SmallVectorImpl<TemplateArgument> &Converted) {
TypeSourceInfo *ArgType = Param->getDefaultArgumentInfo();
// If the argument type is dependent, instantiate it now based
// on the previously-computed template arguments.
if (ArgType->getType()->isInstantiationDependentType()) {
Sema::InstantiatingTemplate Inst(SemaRef, TemplateLoc,
Param, Template, Converted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
return nullptr;
TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Converted);
// Only substitute for the innermost template argument list.
MultiLevelTemplateArgumentList TemplateArgLists;
TemplateArgLists.addOuterTemplateArguments(&TemplateArgs);
for (unsigned i = 0, e = Param->getDepth(); i != e; ++i)
TemplateArgLists.addOuterTemplateArguments(None);
Sema::ContextRAII SavedContext(SemaRef, Template->getDeclContext());
ArgType =
SemaRef.SubstType(ArgType, TemplateArgLists,
Param->getDefaultArgumentLoc(), Param->getDeclName());
}
return ArgType;
}
/// Substitute template arguments into the default template argument for
/// the given non-type template parameter.
///
/// \param SemaRef the semantic analysis object for which we are performing
/// the substitution.
///
/// \param Template the template that we are synthesizing template arguments
/// for.
///
/// \param TemplateLoc the location of the template name that started the
/// template-id we are checking.
///
/// \param RAngleLoc the location of the right angle bracket ('>') that
/// terminates the template-id.
///
/// \param Param the non-type template parameter whose default we are
/// substituting into.
///
/// \param Converted the list of template arguments provided for template
/// parameters that precede \p Param in the template parameter list.
///
/// \returns the substituted template argument, or NULL if an error occurred.
static ExprResult
SubstDefaultTemplateArgument(Sema &SemaRef,
TemplateDecl *Template,
SourceLocation TemplateLoc,
SourceLocation RAngleLoc,
NonTypeTemplateParmDecl *Param,
SmallVectorImpl<TemplateArgument> &Converted) {
Sema::InstantiatingTemplate Inst(SemaRef, TemplateLoc,
Param, Template, Converted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
return ExprError();
TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Converted);
// Only substitute for the innermost template argument list.
MultiLevelTemplateArgumentList TemplateArgLists;
TemplateArgLists.addOuterTemplateArguments(&TemplateArgs);
for (unsigned i = 0, e = Param->getDepth(); i != e; ++i)
TemplateArgLists.addOuterTemplateArguments(None);
+ Sema::ContextRAII SavedContext(SemaRef, Template->getDeclContext());
EnterExpressionEvaluationContext ConstantEvaluated(
SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
return SemaRef.SubstExpr(Param->getDefaultArgument(), TemplateArgLists);
}
/// Substitute template arguments into the default template argument for
/// the given template template parameter.
///
/// \param SemaRef the semantic analysis object for which we are performing
/// the substitution.
///
/// \param Template the template that we are synthesizing template arguments
/// for.
///
/// \param TemplateLoc the location of the template name that started the
/// template-id we are checking.
///
/// \param RAngleLoc the location of the right angle bracket ('>') that
/// terminates the template-id.
///
/// \param Param the template template parameter whose default we are
/// substituting into.
///
/// \param Converted the list of template arguments provided for template
/// parameters that precede \p Param in the template parameter list.
///
/// \param QualifierLoc Will be set to the nested-name-specifier (with
/// source-location information) that precedes the template name.
///
/// \returns the substituted template argument, or NULL if an error occurred.
static TemplateName
SubstDefaultTemplateArgument(Sema &SemaRef,
TemplateDecl *Template,
SourceLocation TemplateLoc,
SourceLocation RAngleLoc,
TemplateTemplateParmDecl *Param,
SmallVectorImpl<TemplateArgument> &Converted,
NestedNameSpecifierLoc &QualifierLoc) {
Sema::InstantiatingTemplate Inst(
SemaRef, TemplateLoc, TemplateParameter(Param), Template, Converted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
return TemplateName();
TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Converted);
// Only substitute for the innermost template argument list.
MultiLevelTemplateArgumentList TemplateArgLists;
TemplateArgLists.addOuterTemplateArguments(&TemplateArgs);
for (unsigned i = 0, e = Param->getDepth(); i != e; ++i)
TemplateArgLists.addOuterTemplateArguments(None);
Sema::ContextRAII SavedContext(SemaRef, Template->getDeclContext());
// Substitute into the nested-name-specifier first,
QualifierLoc = Param->getDefaultArgument().getTemplateQualifierLoc();
if (QualifierLoc) {
QualifierLoc =
SemaRef.SubstNestedNameSpecifierLoc(QualifierLoc, TemplateArgLists);
if (!QualifierLoc)
return TemplateName();
}
return SemaRef.SubstTemplateName(
QualifierLoc,
Param->getDefaultArgument().getArgument().getAsTemplate(),
Param->getDefaultArgument().getTemplateNameLoc(),
TemplateArgLists);
}
/// If the given template parameter has a default template
/// argument, substitute into that default template argument and
/// return the corresponding template argument.
TemplateArgumentLoc
Sema::SubstDefaultTemplateArgumentIfAvailable(TemplateDecl *Template,
SourceLocation TemplateLoc,
SourceLocation RAngleLoc,
Decl *Param,
SmallVectorImpl<TemplateArgument>
&Converted,
bool &HasDefaultArg) {
HasDefaultArg = false;
if (TemplateTypeParmDecl *TypeParm = dyn_cast<TemplateTypeParmDecl>(Param)) {
if (!hasVisibleDefaultArgument(TypeParm))
return TemplateArgumentLoc();
HasDefaultArg = true;
TypeSourceInfo *DI = SubstDefaultTemplateArgument(*this, Template,
TemplateLoc,
RAngleLoc,
TypeParm,
Converted);
if (DI)
return TemplateArgumentLoc(TemplateArgument(DI->getType()), DI);
return TemplateArgumentLoc();
}
if (NonTypeTemplateParmDecl *NonTypeParm
= dyn_cast<NonTypeTemplateParmDecl>(Param)) {
if (!hasVisibleDefaultArgument(NonTypeParm))
return TemplateArgumentLoc();
HasDefaultArg = true;
ExprResult Arg = SubstDefaultTemplateArgument(*this, Template,
TemplateLoc,
RAngleLoc,
NonTypeParm,
Converted);
if (Arg.isInvalid())
return TemplateArgumentLoc();
Expr *ArgE = Arg.getAs<Expr>();
return TemplateArgumentLoc(TemplateArgument(ArgE), ArgE);
}
TemplateTemplateParmDecl *TempTempParm
= cast<TemplateTemplateParmDecl>(Param);
if (!hasVisibleDefaultArgument(TempTempParm))
return TemplateArgumentLoc();
HasDefaultArg = true;
NestedNameSpecifierLoc QualifierLoc;
TemplateName TName = SubstDefaultTemplateArgument(*this, Template,
TemplateLoc,
RAngleLoc,
TempTempParm,
Converted,
QualifierLoc);
if (TName.isNull())
return TemplateArgumentLoc();
return TemplateArgumentLoc(TemplateArgument(TName),
TempTempParm->getDefaultArgument().getTemplateQualifierLoc(),
TempTempParm->getDefaultArgument().getTemplateNameLoc());
}
/// Convert a template-argument that we parsed as a type into a template, if
/// possible. C++ permits injected-class-names to perform dual service as
/// template template arguments and as template type arguments.
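///
/// For illustration (hypothetical names):
/// \code
///   template<template<typename> class TT> struct Wrapper {};
///   template<typename T> struct List {
///     Wrapper<List> w; // injected-class-name 'List' used as a template template argument
///   };
/// \endcode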
static TemplateArgumentLoc convertTypeTemplateArgumentToTemplate(TypeLoc TLoc) {
// Extract and step over any surrounding nested-name-specifier.
NestedNameSpecifierLoc QualLoc;
if (auto ETLoc = TLoc.getAs<ElaboratedTypeLoc>()) {
if (ETLoc.getTypePtr()->getKeyword() != ETK_None)
return TemplateArgumentLoc();
QualLoc = ETLoc.getQualifierLoc();
TLoc = ETLoc.getNamedTypeLoc();
}
// If this type was written as an injected-class-name, it can be used as a
// template template argument.
if (auto InjLoc = TLoc.getAs<InjectedClassNameTypeLoc>())
return TemplateArgumentLoc(InjLoc.getTypePtr()->getTemplateName(),
QualLoc, InjLoc.getNameLoc());
// If this type was written as an injected-class-name, it may have been
// converted to a RecordType during instantiation. If the RecordType is
// *not* wrapped in a TemplateSpecializationType and denotes a class
// template specialization, it must have come from an injected-class-name.
if (auto RecLoc = TLoc.getAs<RecordTypeLoc>())
if (auto *CTSD =
dyn_cast<ClassTemplateSpecializationDecl>(RecLoc.getDecl()))
return TemplateArgumentLoc(TemplateName(CTSD->getSpecializedTemplate()),
QualLoc, RecLoc.getNameLoc());
return TemplateArgumentLoc();
}
/// Check that the given template argument corresponds to the given
/// template parameter.
///
/// \param Param The template parameter against which the argument will be
/// checked.
///
/// \param Arg The template argument, which may be updated due to conversions.
///
/// \param Template The template in which the template argument resides.
///
/// \param TemplateLoc The location of the template name for the template
/// whose argument list we're matching.
///
/// \param RAngleLoc The location of the right angle bracket ('>') that closes
/// the template argument list.
///
/// \param ArgumentPackIndex The index into the argument pack where this
/// argument will be placed. Only valid if the parameter is a parameter pack.
///
/// \param Converted The checked, converted argument will be added to the
/// end of this small vector.
///
/// \param CTAK Describes how we arrived at this particular template argument:
/// explicitly written, deduced, etc.
///
/// \returns true on error, false otherwise.
bool Sema::CheckTemplateArgument(NamedDecl *Param,
TemplateArgumentLoc &Arg,
NamedDecl *Template,
SourceLocation TemplateLoc,
SourceLocation RAngleLoc,
unsigned ArgumentPackIndex,
SmallVectorImpl<TemplateArgument> &Converted,
CheckTemplateArgumentKind CTAK) {
// Check template type parameters.
if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(Param))
return CheckTemplateTypeArgument(TTP, Arg, Converted);
// Check non-type template parameters.
if (NonTypeTemplateParmDecl *NTTP =dyn_cast<NonTypeTemplateParmDecl>(Param)) {
// Do substitution on the type of the non-type template parameter
// with the template arguments we've seen thus far. But if the
// template has a dependent context then we cannot substitute yet.
QualType NTTPType = NTTP->getType();
if (NTTP->isParameterPack() && NTTP->isExpandedParameterPack())
NTTPType = NTTP->getExpansionType(ArgumentPackIndex);
// FIXME: Do we need to substitute into parameters here if they're
// instantiation-dependent but not dependent?
if (NTTPType->isDependentType() &&
!isa<TemplateTemplateParmDecl>(Template) &&
!Template->getDeclContext()->isDependentContext()) {
// Do substitution on the type of the non-type template parameter.
InstantiatingTemplate Inst(*this, TemplateLoc, Template,
NTTP, Converted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
return true;
TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
Converted);
// If the parameter is a pack expansion, expand this slice of the pack.
if (auto *PET = NTTPType->getAs<PackExpansionType>()) {
Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(*this,
ArgumentPackIndex);
NTTPType = SubstType(PET->getPattern(),
MultiLevelTemplateArgumentList(TemplateArgs),
NTTP->getLocation(),
NTTP->getDeclName());
} else {
NTTPType = SubstType(NTTPType,
MultiLevelTemplateArgumentList(TemplateArgs),
NTTP->getLocation(),
NTTP->getDeclName());
}
// If that worked, check the non-type template parameter type
// for validity.
if (!NTTPType.isNull())
NTTPType = CheckNonTypeTemplateParameterType(NTTPType,
NTTP->getLocation());
if (NTTPType.isNull())
return true;
}
switch (Arg.getArgument().getKind()) {
case TemplateArgument::Null:
llvm_unreachable("Should never see a NULL template argument here");
case TemplateArgument::Expression: {
TemplateArgument Result;
unsigned CurSFINAEErrors = NumSFINAEErrors;
ExprResult Res =
CheckTemplateArgument(NTTP, NTTPType, Arg.getArgument().getAsExpr(),
Result, CTAK);
if (Res.isInvalid())
return true;
// If the current template argument causes an error, give up now.
if (CurSFINAEErrors < NumSFINAEErrors)
return true;
// If the resulting expression is new, then use it in place of the
// old expression in the template argument.
if (Res.get() != Arg.getArgument().getAsExpr()) {
TemplateArgument TA(Res.get());
Arg = TemplateArgumentLoc(TA, Res.get());
}
Converted.push_back(Result);
break;
}
case TemplateArgument::Declaration:
case TemplateArgument::Integral:
case TemplateArgument::NullPtr:
// We've already checked this template argument, so just copy
// it to the list of converted arguments.
Converted.push_back(Arg.getArgument());
break;
case TemplateArgument::Template:
case TemplateArgument::TemplateExpansion:
// We were given a template template argument. It may not be ill-formed;
// see below.
if (DependentTemplateName *DTN
= Arg.getArgument().getAsTemplateOrTemplatePattern()
.getAsDependentTemplateName()) {
// We have a template argument such as \c T::template X, which we
// parsed as a template template argument. However, since we now
// know that we need a non-type template argument, convert this
// template name into an expression.
DeclarationNameInfo NameInfo(DTN->getIdentifier(),
Arg.getTemplateNameLoc());
CXXScopeSpec SS;
SS.Adopt(Arg.getTemplateQualifierLoc());
// FIXME: the template-template arg was a DependentTemplateName,
// so it was provided with a template keyword. However, its source
// location is not stored in the template argument structure.
SourceLocation TemplateKWLoc;
ExprResult E = DependentScopeDeclRefExpr::Create(
Context, SS.getWithLocInContext(Context), TemplateKWLoc, NameInfo,
nullptr);
// If we parsed the template argument as a pack expansion, create a
// pack expansion expression.
if (Arg.getArgument().getKind() == TemplateArgument::TemplateExpansion){
E = ActOnPackExpansion(E.get(), Arg.getTemplateEllipsisLoc());
if (E.isInvalid())
return true;
}
TemplateArgument Result;
E = CheckTemplateArgument(NTTP, NTTPType, E.get(), Result);
if (E.isInvalid())
return true;
Converted.push_back(Result);
break;
}
// We have a template argument that actually does refer to a class
// template, alias template, or template template parameter, and
// therefore cannot be a non-type template argument.
Diag(Arg.getLocation(), diag::err_template_arg_must_be_expr)
<< Arg.getSourceRange();
Diag(Param->getLocation(), diag::note_template_param_here);
return true;
case TemplateArgument::Type: {
// We have a non-type template parameter but the template
// argument is a type.
// C++ [temp.arg]p2:
// In a template-argument, an ambiguity between a type-id and
// an expression is resolved to a type-id, regardless of the
// form of the corresponding template-parameter.
//
// We warn specifically about this case, since it can be rather
// confusing for users.
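// For illustration (hypothetical declaration): given 'template<int N> struct A;',
// the argument in 'A<int()>' is parsed as the function type 'int()', not as a
// value-initialized integer.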
QualType T = Arg.getArgument().getAsType();
SourceRange SR = Arg.getSourceRange();
if (T->isFunctionType())
Diag(SR.getBegin(), diag::err_template_arg_nontype_ambig) << SR << T;
else
Diag(SR.getBegin(), diag::err_template_arg_must_be_expr) << SR;
Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
case TemplateArgument::Pack:
llvm_unreachable("Caller must expand template argument packs");
}
return false;
}
// Check template template parameters.
TemplateTemplateParmDecl *TempParm = cast<TemplateTemplateParmDecl>(Param);
TemplateParameterList *Params = TempParm->getTemplateParameters();
if (TempParm->isExpandedParameterPack())
Params = TempParm->getExpansionTemplateParameters(ArgumentPackIndex);
// Substitute into the template parameter list of the template
// template parameter, since previously-supplied template arguments
// may appear within the template template parameter.
//
// FIXME: Skip this if the parameters aren't instantiation-dependent.
{
// Set up a template instantiation context.
LocalInstantiationScope Scope(*this);
InstantiatingTemplate Inst(*this, TemplateLoc, Template,
TempParm, Converted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
return true;
TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack, Converted);
Params = SubstTemplateParams(Params, CurContext,
MultiLevelTemplateArgumentList(TemplateArgs));
if (!Params)
return true;
}
// C++1z [temp.local]p1: (DR1004)
// When [the injected-class-name] is used [...] as a template-argument for
// a template template-parameter [...] it refers to the class template
// itself.
if (Arg.getArgument().getKind() == TemplateArgument::Type) {
TemplateArgumentLoc ConvertedArg = convertTypeTemplateArgumentToTemplate(
Arg.getTypeSourceInfo()->getTypeLoc());
if (!ConvertedArg.getArgument().isNull())
Arg = ConvertedArg;
}
switch (Arg.getArgument().getKind()) {
case TemplateArgument::Null:
llvm_unreachable("Should never see a NULL template argument here");
case TemplateArgument::Template:
case TemplateArgument::TemplateExpansion:
if (CheckTemplateTemplateArgument(Params, Arg))
return true;
Converted.push_back(Arg.getArgument());
break;
case TemplateArgument::Expression:
case TemplateArgument::Type:
// We have a template template parameter but the template
// argument does not refer to a template.
Diag(Arg.getLocation(), diag::err_template_arg_must_be_template)
<< getLangOpts().CPlusPlus11;
return true;
case TemplateArgument::Declaration:
llvm_unreachable("Declaration argument with template template parameter");
case TemplateArgument::Integral:
llvm_unreachable("Integral argument with template template parameter");
case TemplateArgument::NullPtr:
llvm_unreachable("Null pointer argument with template template parameter");
case TemplateArgument::Pack:
llvm_unreachable("Caller must expand template argument packs");
}
return false;
}
/// Check whether the template parameter is a pack expansion, and if so,
/// determine the number of parameters produced by that expansion. For instance:
///
/// \code
/// template<typename ...Ts> struct A {
/// template<Ts ...NTs, template<Ts> class ...TTs, typename ...Us> struct B;
/// };
/// \endcode
///
/// In \c A<int,int>::B, \c NTs and \c TTs have expanded pack size 2, and \c Us
/// is not a pack expansion, so returns an empty Optional.
static Optional<unsigned> getExpandedPackSize(NamedDecl *Param) {
if (NonTypeTemplateParmDecl *NTTP
= dyn_cast<NonTypeTemplateParmDecl>(Param)) {
if (NTTP->isExpandedParameterPack())
return NTTP->getNumExpansionTypes();
}
if (TemplateTemplateParmDecl *TTP
= dyn_cast<TemplateTemplateParmDecl>(Param)) {
if (TTP->isExpandedParameterPack())
return TTP->getNumExpansionTemplateParameters();
}
return None;
}
/// Diagnose a missing template argument.
template<typename TemplateParmDecl>
static bool diagnoseMissingArgument(Sema &S, SourceLocation Loc,
TemplateDecl *TD,
const TemplateParmDecl *D,
TemplateArgumentListInfo &Args) {
// Dig out the most recent declaration of the template parameter; there may be
// declarations of the template that are more recent than TD.
D = cast<TemplateParmDecl>(cast<TemplateDecl>(TD->getMostRecentDecl())
->getTemplateParameters()
->getParam(D->getIndex()));
// If there's a default argument that's not visible, diagnose that we're
// missing a module import.
llvm::SmallVector<Module*, 8> Modules;
if (D->hasDefaultArgument() && !S.hasVisibleDefaultArgument(D, &Modules)) {
S.diagnoseMissingImport(Loc, cast<NamedDecl>(TD),
D->getDefaultArgumentLoc(), Modules,
Sema::MissingImportKind::DefaultArgument,
/*Recover*/true);
return true;
}
// FIXME: If there's a more recent default argument that *is* visible,
// diagnose that it was declared too late.
TemplateParameterList *Params = TD->getTemplateParameters();
S.Diag(Loc, diag::err_template_arg_list_different_arity)
<< /*not enough args*/0
<< (int)S.getTemplateNameKindForDiagnostics(TemplateName(TD))
<< TD;
S.Diag(TD->getLocation(), diag::note_template_decl_here)
<< Params->getSourceRange();
return true;
}
/// Check that the given template argument list is well-formed
/// for specializing the given template.
bool Sema::CheckTemplateArgumentList(
TemplateDecl *Template, SourceLocation TemplateLoc,
TemplateArgumentListInfo &TemplateArgs, bool PartialTemplateArgs,
SmallVectorImpl<TemplateArgument> &Converted,
bool UpdateArgsWithConversions) {
// Make a copy of the template arguments for processing. Only make the
// changes at the end when successful in matching the arguments to the
// template.
TemplateArgumentListInfo NewArgs = TemplateArgs;
// Make sure we get the template parameter list from the most recent
// declaration, since that is the only one that is guaranteed to have all
// the default template argument information.
TemplateParameterList *Params =
cast<TemplateDecl>(Template->getMostRecentDecl())
->getTemplateParameters();
SourceLocation RAngleLoc = NewArgs.getRAngleLoc();
// C++ [temp.arg]p1:
// [...] The type and form of each template-argument specified in
// a template-id shall match the type and form specified for the
// corresponding parameter declared by the template in its
// template-parameter-list.
bool isTemplateTemplateParameter = isa<TemplateTemplateParmDecl>(Template);
SmallVector<TemplateArgument, 2> ArgumentPack;
unsigned ArgIdx = 0, NumArgs = NewArgs.size();
LocalInstantiationScope InstScope(*this, true);
for (TemplateParameterList::iterator Param = Params->begin(),
ParamEnd = Params->end();
Param != ParamEnd; /* increment in loop */) {
// If we have an expanded parameter pack, make sure we don't have too
// many arguments.
if (Optional<unsigned> Expansions = getExpandedPackSize(*Param)) {
if (*Expansions == ArgumentPack.size()) {
// We're done with this parameter pack. Pack up its arguments and add
// them to the list.
Converted.push_back(
TemplateArgument::CreatePackCopy(Context, ArgumentPack));
ArgumentPack.clear();
// This argument is assigned to the next parameter.
++Param;
continue;
} else if (ArgIdx == NumArgs && !PartialTemplateArgs) {
// Not enough arguments for this parameter pack.
Diag(TemplateLoc, diag::err_template_arg_list_different_arity)
<< /*not enough args*/0
<< (int)getTemplateNameKindForDiagnostics(TemplateName(Template))
<< Template;
Diag(Template->getLocation(), diag::note_template_decl_here)
<< Params->getSourceRange();
return true;
}
}
if (ArgIdx < NumArgs) {
// Check the template argument we were given.
if (CheckTemplateArgument(*Param, NewArgs[ArgIdx], Template,
TemplateLoc, RAngleLoc,
ArgumentPack.size(), Converted))
return true;
bool PackExpansionIntoNonPack =
NewArgs[ArgIdx].getArgument().isPackExpansion() &&
(!(*Param)->isTemplateParameterPack() || getExpandedPackSize(*Param));
if (PackExpansionIntoNonPack && isa<TypeAliasTemplateDecl>(Template)) {
// Core issue 1430: we have a pack expansion as an argument to an
// alias template, and it's not part of a parameter pack. This
// can't be canonicalized, so reject it now.
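// For illustration (hypothetical names):
//   template<typename T> using First = T;
//   template<typename ...Ts> struct S { First<Ts...> f; };
// The expansion 'Ts...' maps onto the single non-pack parameter of 'First'
// and is rejected here.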
Diag(NewArgs[ArgIdx].getLocation(),
diag::err_alias_template_expansion_into_fixed_list)
<< NewArgs[ArgIdx].getSourceRange();
Diag((*Param)->getLocation(), diag::note_template_param_here);
return true;
}
// We're now done with this argument.
++ArgIdx;
if ((*Param)->isTemplateParameterPack()) {
// The template parameter was a template parameter pack, so take the
// deduced argument and place it on the argument pack. Note that we
// stay on the same template parameter so that we can deduce more
// arguments.
ArgumentPack.push_back(Converted.pop_back_val());
} else {
// Move to the next template parameter.
++Param;
}
// If we just saw a pack expansion into a non-pack, then directly convert
// the remaining arguments, because we don't know what parameters they'll
// match up with.
if (PackExpansionIntoNonPack) {
if (!ArgumentPack.empty()) {
// If we were part way through filling in an expanded parameter pack,
// fall back to just producing individual arguments.
Converted.insert(Converted.end(),
ArgumentPack.begin(), ArgumentPack.end());
ArgumentPack.clear();
}
while (ArgIdx < NumArgs) {
Converted.push_back(NewArgs[ArgIdx].getArgument());
++ArgIdx;
}
return false;
}
continue;
}
// If we're checking a partial template argument list, we're done.
if (PartialTemplateArgs) {
if ((*Param)->isTemplateParameterPack() && !ArgumentPack.empty())
Converted.push_back(
TemplateArgument::CreatePackCopy(Context, ArgumentPack));
return false;
}
// If we have a template parameter pack with no more corresponding
// arguments, just break out now and we'll fill in the argument pack below.
if ((*Param)->isTemplateParameterPack()) {
assert(!getExpandedPackSize(*Param) &&
"Should have dealt with this already");
// A non-expanded parameter pack before the end of the parameter list
// only occurs for an ill-formed template parameter list, unless we've
// got a partial argument list for a function template, so just bail out.
if (Param + 1 != ParamEnd)
return true;
Converted.push_back(
TemplateArgument::CreatePackCopy(Context, ArgumentPack));
ArgumentPack.clear();
++Param;
continue;
}
// Check whether we have a default argument.
TemplateArgumentLoc Arg;
// Retrieve the default template argument from the template
// parameter. For each kind of template parameter, we substitute the
// template arguments provided thus far and any "outer" template arguments
// (when the template parameter was part of a nested template) into
// the default argument.
if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(*Param)) {
if (!hasVisibleDefaultArgument(TTP))
return diagnoseMissingArgument(*this, TemplateLoc, Template, TTP,
NewArgs);
TypeSourceInfo *ArgType = SubstDefaultTemplateArgument(*this,
Template,
TemplateLoc,
RAngleLoc,
TTP,
Converted);
if (!ArgType)
return true;
Arg = TemplateArgumentLoc(TemplateArgument(ArgType->getType()),
ArgType);
} else if (NonTypeTemplateParmDecl *NTTP
= dyn_cast<NonTypeTemplateParmDecl>(*Param)) {
if (!hasVisibleDefaultArgument(NTTP))
return diagnoseMissingArgument(*this, TemplateLoc, Template, NTTP,
NewArgs);
ExprResult E = SubstDefaultTemplateArgument(*this, Template,
TemplateLoc,
RAngleLoc,
NTTP,
Converted);
if (E.isInvalid())
return true;
Expr *Ex = E.getAs<Expr>();
Arg = TemplateArgumentLoc(TemplateArgument(Ex), Ex);
} else {
TemplateTemplateParmDecl *TempParm
= cast<TemplateTemplateParmDecl>(*Param);
if (!hasVisibleDefaultArgument(TempParm))
return diagnoseMissingArgument(*this, TemplateLoc, Template, TempParm,
NewArgs);
NestedNameSpecifierLoc QualifierLoc;
TemplateName Name = SubstDefaultTemplateArgument(*this, Template,
TemplateLoc,
RAngleLoc,
TempParm,
Converted,
QualifierLoc);
if (Name.isNull())
return true;
Arg = TemplateArgumentLoc(TemplateArgument(Name), QualifierLoc,
TempParm->getDefaultArgument().getTemplateNameLoc());
}
// Introduce an instantiation record that describes where we are using
// the default template argument. We're not actually instantiating a
// template here, we just create this object to put a note into the
// context stack.
InstantiatingTemplate Inst(*this, RAngleLoc, Template, *Param, Converted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
return true;
// Check the default template argument.
if (CheckTemplateArgument(*Param, Arg, Template, TemplateLoc,
RAngleLoc, 0, Converted))
return true;
// Core issue 150 (assumed resolution): if this is a template template
// parameter, keep track of the default template arguments from the
// template definition.
if (isTemplateTemplateParameter)
NewArgs.addArgument(Arg);
// Move to the next template parameter and argument.
++Param;
++ArgIdx;
}
// If we're performing a partial argument substitution, allow any trailing
// pack expansions; they might be empty. This can happen even if
// PartialTemplateArgs is false (the list of arguments is complete but
// still dependent).
if (ArgIdx < NumArgs && CurrentInstantiationScope &&
CurrentInstantiationScope->getPartiallySubstitutedPack()) {
while (ArgIdx < NumArgs && NewArgs[ArgIdx].getArgument().isPackExpansion())
Converted.push_back(NewArgs[ArgIdx++].getArgument());
}
// If we have any leftover arguments, then there were too many arguments.
// Complain and fail.
if (ArgIdx < NumArgs) {
Diag(TemplateLoc, diag::err_template_arg_list_different_arity)
<< /*too many args*/1
<< (int)getTemplateNameKindForDiagnostics(TemplateName(Template))
<< Template
<< SourceRange(NewArgs[ArgIdx].getLocation(), NewArgs.getRAngleLoc());
Diag(Template->getLocation(), diag::note_template_decl_here)
<< Params->getSourceRange();
return true;
}
// No problems found with the new argument list, propagate changes back
// to caller.
if (UpdateArgsWithConversions)
TemplateArgs = std::move(NewArgs);
return false;
}
namespace {
class UnnamedLocalNoLinkageFinder
: public TypeVisitor<UnnamedLocalNoLinkageFinder, bool>
{
Sema &S;
SourceRange SR;
typedef TypeVisitor<UnnamedLocalNoLinkageFinder, bool> inherited;
public:
UnnamedLocalNoLinkageFinder(Sema &S, SourceRange SR) : S(S), SR(SR) { }
bool Visit(QualType T) {
return T.isNull() ? false : inherited::Visit(T.getTypePtr());
}
#define TYPE(Class, Parent) \
bool Visit##Class##Type(const Class##Type *);
#define ABSTRACT_TYPE(Class, Parent) \
bool Visit##Class##Type(const Class##Type *) { return false; }
#define NON_CANONICAL_TYPE(Class, Parent) \
bool Visit##Class##Type(const Class##Type *) { return false; }
#include "clang/AST/TypeNodes.def"
bool VisitTagDecl(const TagDecl *Tag);
bool VisitNestedNameSpecifier(NestedNameSpecifier *NNS);
};
} // end anonymous namespace
bool UnnamedLocalNoLinkageFinder::VisitBuiltinType(const BuiltinType*) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitComplexType(const ComplexType* T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitPointerType(const PointerType* T) {
return Visit(T->getPointeeType());
}
bool UnnamedLocalNoLinkageFinder::VisitBlockPointerType(
const BlockPointerType* T) {
return Visit(T->getPointeeType());
}
bool UnnamedLocalNoLinkageFinder::VisitLValueReferenceType(
const LValueReferenceType* T) {
return Visit(T->getPointeeType());
}
bool UnnamedLocalNoLinkageFinder::VisitRValueReferenceType(
const RValueReferenceType* T) {
return Visit(T->getPointeeType());
}
bool UnnamedLocalNoLinkageFinder::VisitMemberPointerType(
const MemberPointerType* T) {
return Visit(T->getPointeeType()) || Visit(QualType(T->getClass(), 0));
}
bool UnnamedLocalNoLinkageFinder::VisitConstantArrayType(
const ConstantArrayType* T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitIncompleteArrayType(
const IncompleteArrayType* T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitVariableArrayType(
const VariableArrayType* T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitDependentSizedArrayType(
const DependentSizedArrayType* T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitDependentSizedExtVectorType(
const DependentSizedExtVectorType* T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitDependentAddressSpaceType(
const DependentAddressSpaceType *T) {
return Visit(T->getPointeeType());
}
bool UnnamedLocalNoLinkageFinder::VisitVectorType(const VectorType* T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitDependentVectorType(
const DependentVectorType *T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitExtVectorType(const ExtVectorType* T) {
return Visit(T->getElementType());
}
bool UnnamedLocalNoLinkageFinder::VisitFunctionProtoType(
const FunctionProtoType* T) {
for (const auto &A : T->param_types()) {
if (Visit(A))
return true;
}
return Visit(T->getReturnType());
}
bool UnnamedLocalNoLinkageFinder::VisitFunctionNoProtoType(
const FunctionNoProtoType* T) {
return Visit(T->getReturnType());
}
bool UnnamedLocalNoLinkageFinder::VisitUnresolvedUsingType(
const UnresolvedUsingType*) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitTypeOfExprType(const TypeOfExprType*) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitTypeOfType(const TypeOfType* T) {
return Visit(T->getUnderlyingType());
}
bool UnnamedLocalNoLinkageFinder::VisitDecltypeType(const DecltypeType*) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitUnaryTransformType(
const UnaryTransformType*) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitAutoType(const AutoType *T) {
return Visit(T->getDeducedType());
}
bool UnnamedLocalNoLinkageFinder::VisitDeducedTemplateSpecializationType(
const DeducedTemplateSpecializationType *T) {
return Visit(T->getDeducedType());
}
bool UnnamedLocalNoLinkageFinder::VisitRecordType(const RecordType* T) {
return VisitTagDecl(T->getDecl());
}
bool UnnamedLocalNoLinkageFinder::VisitEnumType(const EnumType* T) {
return VisitTagDecl(T->getDecl());
}
bool UnnamedLocalNoLinkageFinder::VisitTemplateTypeParmType(
const TemplateTypeParmType*) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitSubstTemplateTypeParmPackType(
const SubstTemplateTypeParmPackType *) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitTemplateSpecializationType(
const TemplateSpecializationType*) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitInjectedClassNameType(
const InjectedClassNameType* T) {
return VisitTagDecl(T->getDecl());
}
bool UnnamedLocalNoLinkageFinder::VisitDependentNameType(
const DependentNameType* T) {
return VisitNestedNameSpecifier(T->getQualifier());
}
bool UnnamedLocalNoLinkageFinder::VisitDependentTemplateSpecializationType(
const DependentTemplateSpecializationType* T) {
return VisitNestedNameSpecifier(T->getQualifier());
}
bool UnnamedLocalNoLinkageFinder::VisitPackExpansionType(
const PackExpansionType* T) {
return Visit(T->getPattern());
}
bool UnnamedLocalNoLinkageFinder::VisitObjCObjectType(const ObjCObjectType *) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitObjCInterfaceType(
const ObjCInterfaceType *) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitObjCObjectPointerType(
const ObjCObjectPointerType *) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitAtomicType(const AtomicType* T) {
return Visit(T->getValueType());
}
bool UnnamedLocalNoLinkageFinder::VisitPipeType(const PipeType* T) {
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitTagDecl(const TagDecl *Tag) {
if (Tag->getDeclContext()->isFunctionOrMethod()) {
S.Diag(SR.getBegin(),
S.getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_template_arg_local_type :
diag::ext_template_arg_local_type)
<< S.Context.getTypeDeclType(Tag) << SR;
return true;
}
if (!Tag->hasNameForLinkage()) {
S.Diag(SR.getBegin(),
S.getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_template_arg_unnamed_type :
diag::ext_template_arg_unnamed_type) << SR;
S.Diag(Tag->getLocation(), diag::note_template_unnamed_type_here);
return true;
}
return false;
}
bool UnnamedLocalNoLinkageFinder::VisitNestedNameSpecifier(
NestedNameSpecifier *NNS) {
if (NNS->getPrefix() && VisitNestedNameSpecifier(NNS->getPrefix()))
return true;
switch (NNS->getKind()) {
case NestedNameSpecifier::Identifier:
case NestedNameSpecifier::Namespace:
case NestedNameSpecifier::NamespaceAlias:
case NestedNameSpecifier::Global:
case NestedNameSpecifier::Super:
return false;
case NestedNameSpecifier::TypeSpec:
case NestedNameSpecifier::TypeSpecWithTemplate:
return Visit(QualType(NNS->getAsType(), 0));
}
llvm_unreachable("Invalid NestedNameSpecifier::Kind!");
}
/// Check a template argument against its corresponding
/// template type parameter.
///
/// This routine implements the semantics of C++ [temp.arg.type]. It
/// returns true if an error occurred, and false otherwise.
bool Sema::CheckTemplateArgument(TemplateTypeParmDecl *Param,
TypeSourceInfo *ArgInfo) {
assert(ArgInfo && "invalid TypeSourceInfo");
QualType Arg = ArgInfo->getType();
SourceRange SR = ArgInfo->getTypeLoc().getSourceRange();
if (Arg->isVariablyModifiedType()) {
return Diag(SR.getBegin(), diag::err_variably_modified_template_arg) << Arg;
} else if (Context.hasSameUnqualifiedType(Arg, Context.OverloadTy)) {
return Diag(SR.getBegin(), diag::err_template_arg_overload_type) << SR;
}
// C++03 [temp.arg.type]p2:
// A local type, a type with no linkage, an unnamed type or a type
// compounded from any of these types shall not be used as a
// template-argument for a template type-parameter.
//
// C++11 allows these, and even in C++03 we allow them as an extension with
// a warning.
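// For illustration (hypothetical names):
//   template<typename T> struct X {};
//   void f() { struct Local {}; X<Local> x; } // local type as a template argument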
if (LangOpts.CPlusPlus11 || Arg->hasUnnamedOrLocalType()) {
UnnamedLocalNoLinkageFinder Finder(*this, SR);
(void)Finder.Visit(Context.getCanonicalType(Arg));
}
return false;
}
enum NullPointerValueKind {
NPV_NotNullPointer,
NPV_NullPointer,
NPV_Error
};
/// Determine whether the given template argument is a null pointer
/// value of the appropriate type.
static NullPointerValueKind
isNullPointerValueTemplateArgument(Sema &S, NonTypeTemplateParmDecl *Param,
QualType ParamType, Expr *Arg,
Decl *Entity = nullptr) {
if (Arg->isValueDependent() || Arg->isTypeDependent())
return NPV_NotNullPointer;
// dllimport'd entities aren't constant but are available inside of template
// arguments.
if (Entity && Entity->hasAttr<DLLImportAttr>())
return NPV_NotNullPointer;
if (!S.isCompleteType(Arg->getExprLoc(), ParamType))
llvm_unreachable(
"Incomplete parameter type in isNullPointerValueTemplateArgument!");
if (!S.getLangOpts().CPlusPlus11)
return NPV_NotNullPointer;
// Determine whether we have a constant expression.
ExprResult ArgRV = S.DefaultFunctionArrayConversion(Arg);
if (ArgRV.isInvalid())
return NPV_Error;
Arg = ArgRV.get();
Expr::EvalResult EvalResult;
SmallVector<PartialDiagnosticAt, 8> Notes;
EvalResult.Diag = &Notes;
if (!Arg->EvaluateAsRValue(EvalResult, S.Context) ||
EvalResult.HasSideEffects) {
SourceLocation DiagLoc = Arg->getExprLoc();
// If our only note is the usual "invalid subexpression" note, just point
// the caret at its location rather than producing an essentially
// redundant note.
if (Notes.size() == 1 && Notes[0].second.getDiagID() ==
diag::note_invalid_subexpr_in_const_expr) {
DiagLoc = Notes[0].first;
Notes.clear();
}
S.Diag(DiagLoc, diag::err_template_arg_not_address_constant)
<< Arg->getType() << Arg->getSourceRange();
for (unsigned I = 0, N = Notes.size(); I != N; ++I)
S.Diag(Notes[I].first, Notes[I].second);
S.Diag(Param->getLocation(), diag::note_template_param_here);
return NPV_Error;
}
// C++11 [temp.arg.nontype]p1:
// - an address constant expression of type std::nullptr_t
if (Arg->getType()->isNullPtrType())
return NPV_NullPointer;
// - a constant expression that evaluates to a null pointer value (4.10); or
// - a constant expression that evaluates to a null member pointer value
// (4.11); or
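// For illustration (hypothetical declaration): given 'template<int *P> struct A;',
// 'A<nullptr>' and 'A<(int*)0>' both denote null pointer values under these rules.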
if ((EvalResult.Val.isLValue() && !EvalResult.Val.getLValueBase()) ||
(EvalResult.Val.isMemberPointer() &&
!EvalResult.Val.getMemberPointerDecl())) {
// If our expression has an appropriate type, we've succeeded.
bool ObjCLifetimeConversion;
if (S.Context.hasSameUnqualifiedType(Arg->getType(), ParamType) ||
S.IsQualificationConversion(Arg->getType(), ParamType, false,
ObjCLifetimeConversion))
return NPV_NullPointer;
// The types didn't match, but we know we got a null pointer; complain,
// then recover as if the types were correct.
S.Diag(Arg->getExprLoc(), diag::err_template_arg_wrongtype_null_constant)
<< Arg->getType() << ParamType << Arg->getSourceRange();
S.Diag(Param->getLocation(), diag::note_template_param_here);
return NPV_NullPointer;
}
// If we don't have a null pointer value, but we do have a NULL pointer
// constant, suggest a cast to the appropriate type.
if (Arg->isNullPointerConstant(S.Context, Expr::NPC_NeverValueDependent)) {
std::string Code = "static_cast<" + ParamType.getAsString() + ">(";
S.Diag(Arg->getExprLoc(), diag::err_template_arg_untyped_null_constant)
<< ParamType << FixItHint::CreateInsertion(Arg->getBeginLoc(), Code)
<< FixItHint::CreateInsertion(S.getLocForEndOfToken(Arg->getEndLoc()),
")");
S.Diag(Param->getLocation(), diag::note_template_param_here);
return NPV_NullPointer;
}
// FIXME: If we ever want to support general, address-constant expressions
// as non-type template arguments, we should return the ExprResult here to
// be interpreted by the caller.
return NPV_NotNullPointer;
}
/// Checks whether the given template argument is compatible with its
/// template parameter.
static bool CheckTemplateArgumentIsCompatibleWithParameter(
Sema &S, NonTypeTemplateParmDecl *Param, QualType ParamType, Expr *ArgIn,
Expr *Arg, QualType ArgType) {
bool ObjCLifetimeConversion;
if (ParamType->isPointerType() &&
!ParamType->getAs<PointerType>()->getPointeeType()->isFunctionType() &&
S.IsQualificationConversion(ArgType, ParamType, false,
ObjCLifetimeConversion)) {
// For pointer-to-object types, qualification conversions are
// permitted.
} else {
if (const ReferenceType *ParamRef = ParamType->getAs<ReferenceType>()) {
if (!ParamRef->getPointeeType()->isFunctionType()) {
// C++ [temp.arg.nontype]p5b3:
// For a non-type template-parameter of type reference to
// object, no conversions apply. The type referred to by the
// reference may be more cv-qualified than the (otherwise
// identical) type of the template- argument. The
// template-parameter is bound directly to the
// template-argument, which shall be an lvalue.
// FIXME: Other qualifiers?
unsigned ParamQuals = ParamRef->getPointeeType().getCVRQualifiers();
unsigned ArgQuals = ArgType.getCVRQualifiers();
if ((ParamQuals | ArgQuals) != ParamQuals) {
S.Diag(Arg->getBeginLoc(),
diag::err_template_arg_ref_bind_ignores_quals)
<< ParamType << Arg->getType() << Arg->getSourceRange();
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
}
}
// At this point, the template argument refers to an object or
// function with external linkage. We now need to check whether the
// argument and parameter types are compatible.
if (!S.Context.hasSameUnqualifiedType(ArgType,
ParamType.getNonReferenceType())) {
// We can't perform this conversion or binding.
if (ParamType->isReferenceType())
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_no_ref_bind)
<< ParamType << ArgIn->getType() << Arg->getSourceRange();
else
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_not_convertible)
<< ArgIn->getType() << ParamType << Arg->getSourceRange();
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
}
return false;
}
/// Checks whether the given template argument is the address
/// of an object or function according to C++ [temp.arg.nontype]p1.
static bool
CheckTemplateArgumentAddressOfObjectOrFunction(Sema &S,
NonTypeTemplateParmDecl *Param,
QualType ParamType,
Expr *ArgIn,
TemplateArgument &Converted) {
bool Invalid = false;
Expr *Arg = ArgIn;
QualType ArgType = Arg->getType();
bool AddressTaken = false;
SourceLocation AddrOpLoc;
if (S.getLangOpts().MicrosoftExt) {
// Microsoft Visual C++ strips all casts, allows an arbitrary number of
// dereference and address-of operators.
Arg = Arg->IgnoreParenCasts();
bool ExtWarnMSTemplateArg = false;
UnaryOperatorKind FirstOpKind;
SourceLocation FirstOpLoc;
while (UnaryOperator *UnOp = dyn_cast<UnaryOperator>(Arg)) {
UnaryOperatorKind UnOpKind = UnOp->getOpcode();
if (UnOpKind == UO_Deref)
ExtWarnMSTemplateArg = true;
if (UnOpKind == UO_AddrOf || UnOpKind == UO_Deref) {
Arg = UnOp->getSubExpr()->IgnoreParenCasts();
if (!AddrOpLoc.isValid()) {
FirstOpKind = UnOpKind;
FirstOpLoc = UnOp->getOperatorLoc();
}
} else
break;
}
if (FirstOpLoc.isValid()) {
if (ExtWarnMSTemplateArg)
S.Diag(ArgIn->getBeginLoc(), diag::ext_ms_deref_template_argument)
<< ArgIn->getSourceRange();
if (FirstOpKind == UO_AddrOf)
AddressTaken = true;
else if (Arg->getType()->isPointerType()) {
// We cannot let pointers get dereferenced here; that is obviously not a
// constant expression.
assert(FirstOpKind == UO_Deref);
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_not_decl_ref)
<< Arg->getSourceRange();
}
}
} else {
// See through any implicit casts we added to fix the type.
Arg = Arg->IgnoreImpCasts();
// C++ [temp.arg.nontype]p1:
//
// A template-argument for a non-type, non-template
// template-parameter shall be one of: [...]
//
// -- the address of an object or function with external
// linkage, including function templates and function
// template-ids but excluding non-static class members,
// expressed as & id-expression where the & is optional if
// the name refers to a function or array, or if the
// corresponding template-parameter is a reference; or
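// For example (illustrative; names are placeholders):
//   template<int *P> struct X;
//   int n;                         // object with external linkage
//   X<&n> a;                       // OK: '&' followed by an id-expression
//   void f();
//   template<void (*F)()> struct Y;
//   Y<f> b;                        // OK: the '&' is optional for a function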
// In C++98/03 mode, give an extension warning on any extra parentheses.
// See http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#773
bool ExtraParens = false;
while (ParenExpr *Parens = dyn_cast<ParenExpr>(Arg)) {
if (!Invalid && !ExtraParens) {
S.Diag(Arg->getBeginLoc(),
S.getLangOpts().CPlusPlus11
? diag::warn_cxx98_compat_template_arg_extra_parens
: diag::ext_template_arg_extra_parens)
<< Arg->getSourceRange();
ExtraParens = true;
}
Arg = Parens->getSubExpr();
}
while (SubstNonTypeTemplateParmExpr *subst =
dyn_cast<SubstNonTypeTemplateParmExpr>(Arg))
Arg = subst->getReplacement()->IgnoreImpCasts();
if (UnaryOperator *UnOp = dyn_cast<UnaryOperator>(Arg)) {
if (UnOp->getOpcode() == UO_AddrOf) {
Arg = UnOp->getSubExpr();
AddressTaken = true;
AddrOpLoc = UnOp->getOperatorLoc();
}
}
while (SubstNonTypeTemplateParmExpr *subst =
dyn_cast<SubstNonTypeTemplateParmExpr>(Arg))
Arg = subst->getReplacement()->IgnoreImpCasts();
}
DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(Arg);
ValueDecl *Entity = DRE ? DRE->getDecl() : nullptr;
// If our parameter has pointer type, check for a null template value.
if (ParamType->isPointerType() || ParamType->isNullPtrType()) {
switch (isNullPointerValueTemplateArgument(S, Param, ParamType, ArgIn,
Entity)) {
case NPV_NullPointer:
S.Diag(Arg->getExprLoc(), diag::warn_cxx98_compat_template_arg_null);
Converted = TemplateArgument(S.Context.getCanonicalType(ParamType),
/*isNullPtr=*/true);
return false;
case NPV_Error:
return true;
case NPV_NotNullPointer:
break;
}
}
// Stop checking the precise nature of the argument if it is value dependent,
// it should be checked when instantiated.
if (Arg->isValueDependent()) {
Converted = TemplateArgument(ArgIn);
return false;
}
if (isa<CXXUuidofExpr>(Arg)) {
if (CheckTemplateArgumentIsCompatibleWithParameter(S, Param, ParamType,
ArgIn, Arg, ArgType))
return true;
Converted = TemplateArgument(ArgIn);
return false;
}
if (!DRE) {
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_not_decl_ref)
<< Arg->getSourceRange();
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
// Cannot refer to non-static data members
if (isa<FieldDecl>(Entity) || isa<IndirectFieldDecl>(Entity)) {
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_field)
<< Entity << Arg->getSourceRange();
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
// Cannot refer to non-static member functions
if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(Entity)) {
if (!Method->isStatic()) {
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_method)
<< Method << Arg->getSourceRange();
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
}
FunctionDecl *Func = dyn_cast<FunctionDecl>(Entity);
VarDecl *Var = dyn_cast<VarDecl>(Entity);
// A non-type template argument must refer to an object or function.
if (!Func && !Var) {
// We found something, but we don't know specifically what it is.
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_not_object_or_func)
<< Arg->getSourceRange();
S.Diag(DRE->getDecl()->getLocation(), diag::note_template_arg_refers_here);
return true;
}
// Address / reference template args must have external linkage in C++98.
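// For example (illustrative; names are placeholders):
//   template<int *P> struct X;
//   static int s;                  // internal linkage
//   X<&s> x;                       // ill-formed in C++98/03 (diagnosed below),
//                                  // allowed since C++11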
if (Entity->getFormalLinkage() == InternalLinkage) {
S.Diag(Arg->getBeginLoc(),
S.getLangOpts().CPlusPlus11
? diag::warn_cxx98_compat_template_arg_object_internal
: diag::ext_template_arg_object_internal)
<< !Func << Entity << Arg->getSourceRange();
S.Diag(Entity->getLocation(), diag::note_template_arg_internal_object)
<< !Func;
} else if (!Entity->hasLinkage()) {
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_object_no_linkage)
<< !Func << Entity << Arg->getSourceRange();
S.Diag(Entity->getLocation(), diag::note_template_arg_internal_object)
<< !Func;
return true;
}
if (Func) {
// If the template parameter has pointer type, the function decays.
if (ParamType->isPointerType() && !AddressTaken)
ArgType = S.Context.getPointerType(Func->getType());
else if (AddressTaken && ParamType->isReferenceType()) {
// If we originally had an address-of operator, but the
// parameter has reference type, complain and (if things look
// like they will work) drop the address-of operator.
if (!S.Context.hasSameUnqualifiedType(Func->getType(),
ParamType.getNonReferenceType())) {
S.Diag(AddrOpLoc, diag::err_template_arg_address_of_non_pointer)
<< ParamType;
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
S.Diag(AddrOpLoc, diag::err_template_arg_address_of_non_pointer)
<< ParamType
<< FixItHint::CreateRemoval(AddrOpLoc);
S.Diag(Param->getLocation(), diag::note_template_param_here);
ArgType = Func->getType();
}
} else {
// A value of reference type is not an object.
if (Var->getType()->isReferenceType()) {
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_reference_var)
<< Var->getType() << Arg->getSourceRange();
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
// A template argument must have static storage duration.
if (Var->getTLSKind()) {
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_thread_local)
<< Arg->getSourceRange();
S.Diag(Var->getLocation(), diag::note_template_arg_refers_here);
return true;
}
// If the template parameter has pointer type, we must have taken
// the address of this object.
if (ParamType->isReferenceType()) {
if (AddressTaken) {
// If we originally had an address-of operator, but the
// parameter has reference type, complain and (if things look
// like they will work) drop the address-of operator.
if (!S.Context.hasSameUnqualifiedType(Var->getType(),
ParamType.getNonReferenceType())) {
S.Diag(AddrOpLoc, diag::err_template_arg_address_of_non_pointer)
<< ParamType;
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
S.Diag(AddrOpLoc, diag::err_template_arg_address_of_non_pointer)
<< ParamType
<< FixItHint::CreateRemoval(AddrOpLoc);
S.Diag(Param->getLocation(), diag::note_template_param_here);
ArgType = Var->getType();
}
} else if (!AddressTaken && ParamType->isPointerType()) {
if (Var->getType()->isArrayType()) {
// Array-to-pointer decay.
ArgType = S.Context.getArrayDecayedType(Var->getType());
} else {
// If the template parameter has pointer type but the address of
// this object was not taken, complain and (possibly) recover by
// taking the address of the entity.
ArgType = S.Context.getPointerType(Var->getType());
if (!S.Context.hasSameUnqualifiedType(ArgType, ParamType)) {
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_not_address_of)
<< ParamType;
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_not_address_of)
<< ParamType << FixItHint::CreateInsertion(Arg->getBeginLoc(), "&");
S.Diag(Param->getLocation(), diag::note_template_param_here);
}
}
}
if (CheckTemplateArgumentIsCompatibleWithParameter(S, Param, ParamType, ArgIn,
Arg, ArgType))
return true;
// Create the template argument.
Converted =
TemplateArgument(cast<ValueDecl>(Entity->getCanonicalDecl()), ParamType);
S.MarkAnyDeclReferenced(Arg->getBeginLoc(), Entity, false);
return false;
}
/// Checks whether the given template argument is a pointer to
/// member constant according to C++ [temp.arg.nontype]p1.
static bool CheckTemplateArgumentPointerToMember(Sema &S,
NonTypeTemplateParmDecl *Param,
QualType ParamType,
Expr *&ResultArg,
TemplateArgument &Converted) {
bool Invalid = false;
Expr *Arg = ResultArg;
bool ObjCLifetimeConversion;
// C++ [temp.arg.nontype]p1:
//
// A template-argument for a non-type, non-template
// template-parameter shall be one of: [...]
//
// -- a pointer to member expressed as described in 5.3.1.
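// For example (illustrative; 'S' and 'X' are placeholder names):
//   struct S { int m; };
//   template<int S::*P> struct X;
//   X<&S::m> x;   // a pointer-to-member constant of the required form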
DeclRefExpr *DRE = nullptr;
// In C++98/03 mode, give an extension warning on any extra parentheses.
// See http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#773
bool ExtraParens = false;
while (ParenExpr *Parens = dyn_cast<ParenExpr>(Arg)) {
if (!Invalid && !ExtraParens) {
S.Diag(Arg->getBeginLoc(),
S.getLangOpts().CPlusPlus11
? diag::warn_cxx98_compat_template_arg_extra_parens
: diag::ext_template_arg_extra_parens)
<< Arg->getSourceRange();
ExtraParens = true;
}
Arg = Parens->getSubExpr();
}
while (SubstNonTypeTemplateParmExpr *subst =
dyn_cast<SubstNonTypeTemplateParmExpr>(Arg))
Arg = subst->getReplacement()->IgnoreImpCasts();
// A pointer-to-member constant written &Class::member.
if (UnaryOperator *UnOp = dyn_cast<UnaryOperator>(Arg)) {
if (UnOp->getOpcode() == UO_AddrOf) {
DRE = dyn_cast<DeclRefExpr>(UnOp->getSubExpr());
if (DRE && !DRE->getQualifier())
DRE = nullptr;
}
}
// A constant of pointer-to-member type.
else if ((DRE = dyn_cast<DeclRefExpr>(Arg))) {
ValueDecl *VD = DRE->getDecl();
if (VD->getType()->isMemberPointerType()) {
if (isa<NonTypeTemplateParmDecl>(VD)) {
if (Arg->isTypeDependent() || Arg->isValueDependent()) {
Converted = TemplateArgument(Arg);
} else {
VD = cast<ValueDecl>(VD->getCanonicalDecl());
Converted = TemplateArgument(VD, ParamType);
}
return Invalid;
}
}
DRE = nullptr;
}
ValueDecl *Entity = DRE ? DRE->getDecl() : nullptr;
// Check for a null pointer value.
switch (isNullPointerValueTemplateArgument(S, Param, ParamType, ResultArg,
Entity)) {
case NPV_Error:
return true;
case NPV_NullPointer:
S.Diag(ResultArg->getExprLoc(), diag::warn_cxx98_compat_template_arg_null);
Converted = TemplateArgument(S.Context.getCanonicalType(ParamType),
/*isNullPtr*/true);
return false;
case NPV_NotNullPointer:
break;
}
if (S.IsQualificationConversion(ResultArg->getType(),
ParamType.getNonReferenceType(), false,
ObjCLifetimeConversion)) {
ResultArg = S.ImpCastExprToType(ResultArg, ParamType, CK_NoOp,
ResultArg->getValueKind())
.get();
} else if (!S.Context.hasSameUnqualifiedType(
ResultArg->getType(), ParamType.getNonReferenceType())) {
// We can't perform this conversion.
S.Diag(ResultArg->getBeginLoc(), diag::err_template_arg_not_convertible)
<< ResultArg->getType() << ParamType << ResultArg->getSourceRange();
S.Diag(Param->getLocation(), diag::note_template_param_here);
return true;
}
if (!DRE)
return S.Diag(Arg->getBeginLoc(),
diag::err_template_arg_not_pointer_to_member_form)
<< Arg->getSourceRange();
if (isa<FieldDecl>(DRE->getDecl()) ||
isa<IndirectFieldDecl>(DRE->getDecl()) ||
isa<CXXMethodDecl>(DRE->getDecl())) {
assert((isa<FieldDecl>(DRE->getDecl()) ||
isa<IndirectFieldDecl>(DRE->getDecl()) ||
!cast<CXXMethodDecl>(DRE->getDecl())->isStatic()) &&
"Only non-static member pointers can make it here");
// Okay: this is the address of a non-static member, and therefore
// a member pointer constant.
if (Arg->isTypeDependent() || Arg->isValueDependent()) {
Converted = TemplateArgument(Arg);
} else {
ValueDecl *D = cast<ValueDecl>(DRE->getDecl()->getCanonicalDecl());
Converted = TemplateArgument(D, ParamType);
}
return Invalid;
}
// We found something else, but we don't know specifically what it is.
S.Diag(Arg->getBeginLoc(), diag::err_template_arg_not_pointer_to_member_form)
<< Arg->getSourceRange();
S.Diag(DRE->getDecl()->getLocation(), diag::note_template_arg_refers_here);
return true;
}
/// Check a template argument against its corresponding
/// non-type template parameter.
///
/// This routine implements the semantics of C++ [temp.arg.nontype].
/// If an error occurred, it returns ExprError(); otherwise, it
/// returns the converted template argument. \p ParamType is the
/// type of the non-type template parameter after it has been instantiated.
ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
QualType ParamType, Expr *Arg,
TemplateArgument &Converted,
CheckTemplateArgumentKind CTAK) {
SourceLocation StartLoc = Arg->getBeginLoc();
// If the parameter type somehow involves auto, deduce the type now.
if (getLangOpts().CPlusPlus17 && ParamType->isUndeducedType()) {
// During template argument deduction, we allow 'decltype(auto)' to
// match an arbitrary dependent argument.
// FIXME: The language rules don't say what happens in this case.
// FIXME: We get an opaque dependent type out of decltype(auto) if the
// expression is merely instantiation-dependent; is this enough?
if (CTAK == CTAK_Deduced && Arg->isTypeDependent()) {
auto *AT = dyn_cast<AutoType>(ParamType);
if (AT && AT->isDecltypeAuto()) {
Converted = TemplateArgument(Arg);
return Arg;
}
}
// When checking a deduced template argument, deduce from its type even if
// the type is dependent, in order to check that the types of non-type template
// arguments line up properly in partial ordering.
Optional<unsigned> Depth = Param->getDepth() + 1;
Expr *DeductionArg = Arg;
if (auto *PE = dyn_cast<PackExpansionExpr>(DeductionArg))
DeductionArg = PE->getPattern();
if (DeduceAutoType(
Context.getTrivialTypeSourceInfo(ParamType, Param->getLocation()),
DeductionArg, ParamType, Depth) == DAR_Failed) {
Diag(Arg->getExprLoc(),
diag::err_non_type_template_parm_type_deduction_failure)
<< Param->getDeclName() << Param->getType() << Arg->getType()
<< Arg->getSourceRange();
Diag(Param->getLocation(), diag::note_template_param_here);
return ExprError();
}
// CheckNonTypeTemplateParameterType will produce a diagnostic if there's
// an error. The error message normally references the parameter
// declaration, but here we'll pass the argument location because that's
// where the parameter type is deduced.
ParamType = CheckNonTypeTemplateParameterType(ParamType, Arg->getExprLoc());
if (ParamType.isNull()) {
Diag(Param->getLocation(), diag::note_template_param_here);
return ExprError();
}
}
// We should have already dropped all cv-qualifiers by now.
assert(!ParamType.hasQualifiers() &&
"non-type template parameter type cannot be qualified");
if (CTAK == CTAK_Deduced &&
!Context.hasSameType(ParamType.getNonLValueExprType(Context),
Arg->getType())) {
// FIXME: If either type is dependent, we skip the check. This isn't
// correct, since during deduction we're supposed to have replaced each
// template parameter with some unique (non-dependent) placeholder.
// FIXME: If the argument type contains 'auto', we carry on and fail the
// type check in order to force specific types to be more specialized than
// 'auto'. It's not clear how partial ordering with 'auto' is supposed to
// work.
if ((ParamType->isDependentType() || Arg->isTypeDependent()) &&
!Arg->getType()->getContainedAutoType()) {
Converted = TemplateArgument(Arg);
return Arg;
}
// FIXME: This attempts to implement C++ [temp.deduct.type]p17. Per DR1770,
// we should actually be checking the type of the template argument in P,
// not the type of the template argument deduced from A, against the
// template parameter type.
Diag(StartLoc, diag::err_deduced_non_type_template_arg_type_mismatch)
<< Arg->getType()
<< ParamType.getUnqualifiedType();
Diag(Param->getLocation(), diag::note_template_param_here);
return ExprError();
}
// If either the parameter has a dependent type or the argument is
// type-dependent, there's nothing we can check now.
if (ParamType->isDependentType() || Arg->isTypeDependent()) {
// Force the argument to the type of the parameter to maintain invariants.
auto *PE = dyn_cast<PackExpansionExpr>(Arg);
if (PE)
Arg = PE->getPattern();
ExprResult E = ImpCastExprToType(
Arg, ParamType.getNonLValueExprType(Context), CK_Dependent,
ParamType->isLValueReferenceType() ? VK_LValue :
ParamType->isRValueReferenceType() ? VK_XValue : VK_RValue);
if (E.isInvalid())
return ExprError();
if (PE) {
// Recreate a pack expansion if we unwrapped one.
E = new (Context)
PackExpansionExpr(E.get()->getType(), E.get(), PE->getEllipsisLoc(),
PE->getNumExpansions());
}
Converted = TemplateArgument(E.get());
return E;
}
// The initialization of the parameter from the argument is
// a constant-evaluated context.
EnterExpressionEvaluationContext ConstantEvaluated(
*this, Sema::ExpressionEvaluationContext::ConstantEvaluated);
if (getLangOpts().CPlusPlus17) {
// C++17 [temp.arg.nontype]p1:
// A template-argument for a non-type template parameter shall be
// a converted constant expression of the type of the template-parameter.
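// For example (illustrative; names are placeholders):
//   template<int N> struct A;
//   constexpr int f() { return 3; }
//   A<f()> a;   // OK: 'f()' is a converted constant expression of type 'int'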
APValue Value;
ExprResult ArgResult = CheckConvertedConstantExpression(
Arg, ParamType, Value, CCEK_TemplateArg);
if (ArgResult.isInvalid())
return ExprError();
// For a value-dependent argument, CheckConvertedConstantExpression is
// permitted (and expected) to be unable to determine a value.
if (ArgResult.get()->isValueDependent()) {
Converted = TemplateArgument(ArgResult.get());
return ArgResult;
}
QualType CanonParamType = Context.getCanonicalType(ParamType);
// Convert the APValue to a TemplateArgument.
switch (Value.getKind()) {
case APValue::None:
assert(ParamType->isNullPtrType());
Converted = TemplateArgument(CanonParamType, /*isNullPtr*/true);
break;
case APValue::Indeterminate:
llvm_unreachable("result of constant evaluation should be initialized");
break;
case APValue::Int:
assert(ParamType->isIntegralOrEnumerationType());
Converted = TemplateArgument(Context, Value.getInt(), CanonParamType);
break;
case APValue::MemberPointer: {
assert(ParamType->isMemberPointerType());
// FIXME: We need TemplateArgument representation and mangling for these.
if (!Value.getMemberPointerPath().empty()) {
Diag(Arg->getBeginLoc(),
diag::err_template_arg_member_ptr_base_derived_not_supported)
<< Value.getMemberPointerDecl() << ParamType
<< Arg->getSourceRange();
return ExprError();
}
auto *VD = const_cast<ValueDecl*>(Value.getMemberPointerDecl());
Converted = VD ? TemplateArgument(VD, CanonParamType)
: TemplateArgument(CanonParamType, /*isNullPtr*/true);
break;
}
case APValue::LValue: {
// For a non-type template-parameter of pointer or reference type,
// the value of the constant expression shall not refer to
assert(ParamType->isPointerType() || ParamType->isReferenceType() ||
ParamType->isNullPtrType());
// -- a temporary object
// -- a string literal
// -- the result of a typeid expression, or
// -- a predefined __func__ variable
APValue::LValueBase Base = Value.getLValueBase();
auto *VD = const_cast<ValueDecl *>(Base.dyn_cast<const ValueDecl *>());
if (Base && !VD) {
auto *E = Base.dyn_cast<const Expr *>();
if (E && isa<CXXUuidofExpr>(E)) {
Converted = TemplateArgument(ArgResult.get()->IgnoreImpCasts());
break;
}
Diag(Arg->getBeginLoc(), diag::err_template_arg_not_decl_ref)
<< Arg->getSourceRange();
return ExprError();
}
// -- a subobject
if (Value.hasLValuePath() && Value.getLValuePath().size() == 1 &&
VD && VD->getType()->isArrayType() &&
Value.getLValuePath()[0].getAsArrayIndex() == 0 &&
!Value.isLValueOnePastTheEnd() && ParamType->isPointerType()) {
// Per defect report (no number yet):
// ... other than a pointer to the first element of a complete array
// object.
} else if (!Value.hasLValuePath() || Value.getLValuePath().size() ||
Value.isLValueOnePastTheEnd()) {
Diag(StartLoc, diag::err_non_type_template_arg_subobject)
<< Value.getAsString(Context, ParamType);
return ExprError();
}
assert((VD || !ParamType->isReferenceType()) &&
"null reference should not be a constant expression");
assert((!VD || !ParamType->isNullPtrType()) &&
"non-null value of type nullptr_t?");
Converted = VD ? TemplateArgument(VD, CanonParamType)
: TemplateArgument(CanonParamType, /*isNullPtr*/true);
break;
}
case APValue::AddrLabelDiff:
return Diag(StartLoc, diag::err_non_type_template_arg_addr_label_diff);
case APValue::FixedPoint:
case APValue::Float:
case APValue::ComplexInt:
case APValue::ComplexFloat:
case APValue::Vector:
case APValue::Array:
case APValue::Struct:
case APValue::Union:
llvm_unreachable("invalid kind for template argument");
}
return ArgResult.get();
}
// C++ [temp.arg.nontype]p5:
// The following conversions are performed on each expression used
// as a non-type template-argument. If a non-type
// template-argument cannot be converted to the type of the
// corresponding template-parameter then the program is
// ill-formed.
if (ParamType->isIntegralOrEnumerationType()) {
// C++11:
// -- for a non-type template-parameter of integral or
// enumeration type, conversions permitted in a converted
// constant expression are applied.
//
// C++98:
// -- for a non-type template-parameter of integral or
// enumeration type, integral promotions (4.5) and integral
// conversions (4.7) are applied.
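// For example (illustrative; names are placeholders):
//   enum E { e1 = 1 };
//   template<int N> struct B;
//   B<e1> b;   // the enumerator is converted to the parameter's type 'int'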
if (getLangOpts().CPlusPlus11) {
// C++ [temp.arg.nontype]p1:
// A template-argument for a non-type, non-template template-parameter
// shall be one of:
//
// -- for a non-type template-parameter of integral or enumeration
// type, a converted constant expression of the type of the
// template-parameter; or
llvm::APSInt Value;
ExprResult ArgResult =
CheckConvertedConstantExpression(Arg, ParamType, Value,
CCEK_TemplateArg);
if (ArgResult.isInvalid())
return ExprError();
// We can't check arbitrary value-dependent arguments.
if (ArgResult.get()->isValueDependent()) {
Converted = TemplateArgument(ArgResult.get());
return ArgResult;
}
// Widen the argument value to sizeof(parameter type). This is almost
// always a no-op, except when the parameter type is bool. In
// that case, this may extend the argument from 1 bit to 8 bits.
QualType IntegerType = ParamType;
if (const EnumType *Enum = IntegerType->getAs<EnumType>())
IntegerType = Enum->getDecl()->getIntegerType();
Value = Value.extOrTrunc(Context.getTypeSize(IntegerType));
Converted = TemplateArgument(Context, Value,
Context.getCanonicalType(ParamType));
return ArgResult;
}
ExprResult ArgResult = DefaultLvalueConversion(Arg);
if (ArgResult.isInvalid())
return ExprError();
Arg = ArgResult.get();
QualType ArgType = Arg->getType();
// C++ [temp.arg.nontype]p1:
// A template-argument for a non-type, non-template
// template-parameter shall be one of:
//
// -- an integral constant-expression of integral or enumeration
// type; or
// -- the name of a non-type template-parameter; or
llvm::APSInt Value;
if (!ArgType->isIntegralOrEnumerationType()) {
Diag(Arg->getBeginLoc(), diag::err_template_arg_not_integral_or_enumeral)
<< ArgType << Arg->getSourceRange();
Diag(Param->getLocation(), diag::note_template_param_here);
return ExprError();
} else if (!Arg->isValueDependent()) {
class TmplArgICEDiagnoser : public VerifyICEDiagnoser {
QualType T;
public:
TmplArgICEDiagnoser(QualType T) : T(T) { }
void diagnoseNotICE(Sema &S, SourceLocation Loc,
SourceRange SR) override {
S.Diag(Loc, diag::err_template_arg_not_ice) << T << SR;
}
} Diagnoser(ArgType);
Arg = VerifyIntegerConstantExpression(Arg, &Value, Diagnoser,
false).get();
if (!Arg)
return ExprError();
}
// From here on out, all we care about is the unqualified form
// of the argument type.
ArgType = ArgType.getUnqualifiedType();
// Try to convert the argument to the parameter's type.
if (Context.hasSameType(ParamType, ArgType)) {
// Okay: no conversion necessary
} else if (ParamType->isBooleanType()) {
// This is an integral-to-boolean conversion.
Arg = ImpCastExprToType(Arg, ParamType, CK_IntegralToBoolean).get();
} else if (IsIntegralPromotion(Arg, ArgType, ParamType) ||
!ParamType->isEnumeralType()) {
// This is an integral promotion or conversion.
Arg = ImpCastExprToType(Arg, ParamType, CK_IntegralCast).get();
} else {
// We can't perform this conversion.
Diag(Arg->getBeginLoc(), diag::err_template_arg_not_convertible)
<< Arg->getType() << ParamType << Arg->getSourceRange();
Diag(Param->getLocation(), diag::note_template_param_here);
return ExprError();
}
// Add the value of this argument to the list of converted
// arguments. We use the bitwidth and signedness of the template
// parameter.
if (Arg->isValueDependent()) {
// The argument is value-dependent. Create a new
// TemplateArgument with the converted expression.
Converted = TemplateArgument(Arg);
return Arg;
}
QualType IntegerType = Context.getCanonicalType(ParamType);
if (const EnumType *Enum = IntegerType->getAs<EnumType>())
IntegerType = Context.getCanonicalType(Enum->getDecl()->getIntegerType());
if (ParamType->isBooleanType()) {
// Value must be zero or one.
Value = Value != 0;
unsigned AllowedBits = Context.getTypeSize(IntegerType);
if (Value.getBitWidth() != AllowedBits)
Value = Value.extOrTrunc(AllowedBits);
Value.setIsSigned(IntegerType->isSignedIntegerOrEnumerationType());
} else {
llvm::APSInt OldValue = Value;
// Coerce the template argument's value to the value it will have
// based on the template parameter's type.
unsigned AllowedBits = Context.getTypeSize(IntegerType);
if (Value.getBitWidth() != AllowedBits)
Value = Value.extOrTrunc(AllowedBits);
Value.setIsSigned(IntegerType->isSignedIntegerOrEnumerationType());
// Complain if an unsigned parameter received a negative value.
if (IntegerType->isUnsignedIntegerOrEnumerationType()
&& (OldValue.isSigned() && OldValue.isNegative())) {
Diag(Arg->getBeginLoc(), diag::warn_template_arg_negative)
<< OldValue.toString(10) << Value.toString(10) << Param->getType()
<< Arg->getSourceRange();
Diag(Param->getLocation(), diag::note_template_param_here);
}
// Complain if we overflowed the template parameter's type.
unsigned RequiredBits;
if (IntegerType->isUnsignedIntegerOrEnumerationType())
RequiredBits = OldValue.getActiveBits();
else if (OldValue.isUnsigned())
RequiredBits = OldValue.getActiveBits() + 1;
else
RequiredBits = OldValue.getMinSignedBits();
if (RequiredBits > AllowedBits) {
Diag(Arg->getBeginLoc(), diag::warn_template_arg_too_large)
<< OldValue.toString(10) << Value.toString(10) << Param->getType()
<< Arg->getSourceRange();
Diag(Param->getLocation(), diag::note_template_param_here);
}
}
Converted = TemplateArgument(Context, Value,
ParamType->isEnumeralType()
? Context.getCanonicalType(ParamType)
: IntegerType);
return Arg;
}
QualType ArgType = Arg->getType();
DeclAccessPair FoundResult; // temporary for ResolveOverloadedFunction
// Handle pointer-to-function, reference-to-function, and
// pointer-to-member-function all in (roughly) the same way.
if (// -- For a non-type template-parameter of type pointer to
// function, only the function-to-pointer conversion (4.3) is
// applied. If the template-argument represents a set of
// overloaded functions (or a pointer to such), the matching
// function is selected from the set (13.4).
(ParamType->isPointerType() &&
ParamType->getAs<PointerType>()->getPointeeType()->isFunctionType()) ||
// -- For a non-type template-parameter of type reference to
// function, no conversions apply. If the template-argument
// represents a set of overloaded functions, the matching
// function is selected from the set (13.4).
(ParamType->isReferenceType() &&
ParamType->getAs<ReferenceType>()->getPointeeType()->isFunctionType()) ||
// -- For a non-type template-parameter of type pointer to
// member function, no conversions apply. If the
// template-argument represents a set of overloaded member
// functions, the matching member function is selected from
// the set (13.4).
(ParamType->isMemberPointerType() &&
ParamType->getAs<MemberPointerType>()->getPointeeType()
->isFunctionType())) {
if (Arg->getType() == Context.OverloadTy) {
if (FunctionDecl *Fn = ResolveAddressOfOverloadedFunction(Arg, ParamType,
true,
FoundResult)) {
if (DiagnoseUseOfDecl(Fn, Arg->getBeginLoc()))
return ExprError();
Arg = FixOverloadedFunctionReference(Arg, FoundResult, Fn);
ArgType = Arg->getType();
} else
return ExprError();
}
if (!ParamType->isMemberPointerType()) {
if (CheckTemplateArgumentAddressOfObjectOrFunction(*this, Param,
ParamType,
Arg, Converted))
return ExprError();
return Arg;
}
if (CheckTemplateArgumentPointerToMember(*this, Param, ParamType, Arg,
Converted))
return ExprError();
return Arg;
}
if (ParamType->isPointerType()) {
// -- for a non-type template-parameter of type pointer to
// object, qualification conversions (4.4) and the
// array-to-pointer conversion (4.2) are applied.
// C++0x also allows a value of std::nullptr_t.
assert(ParamType->getPointeeType()->isIncompleteOrObjectType() &&
"Only object pointers allowed here");
if (CheckTemplateArgumentAddressOfObjectOrFunction(*this, Param,
ParamType,
Arg, Converted))
return ExprError();
return Arg;
}
if (const ReferenceType *ParamRefType = ParamType->getAs<ReferenceType>()) {
// -- For a non-type template-parameter of type reference to
// object, no conversions apply. The type referred to by the
// reference may be more cv-qualified than the (otherwise
// identical) type of the template-argument. The
// template-parameter is bound directly to the
// template-argument, which must be an lvalue.
assert(ParamRefType->getPointeeType()->isIncompleteOrObjectType() &&
"Only object references allowed here");
if (Arg->getType() == Context.OverloadTy) {
if (FunctionDecl *Fn = ResolveAddressOfOverloadedFunction(Arg,
ParamRefType->getPointeeType(),
true,
FoundResult)) {
if (DiagnoseUseOfDecl(Fn, Arg->getBeginLoc()))
return ExprError();
Arg = FixOverloadedFunctionReference(Arg, FoundResult, Fn);
ArgType = Arg->getType();
} else
return ExprError();
}
if (CheckTemplateArgumentAddressOfObjectOrFunction(*this, Param,
ParamType,
Arg, Converted))
return ExprError();
return Arg;
}
// Deal with parameters of type std::nullptr_t.
if (ParamType->isNullPtrType()) {
if (Arg->isTypeDependent() || Arg->isValueDependent()) {
Converted = TemplateArgument(Arg);
return Arg;
}
switch (isNullPointerValueTemplateArgument(*this, Param, ParamType, Arg)) {
case NPV_NotNullPointer:
Diag(Arg->getExprLoc(), diag::err_template_arg_not_convertible)
<< Arg->getType() << ParamType;
Diag(Param->getLocation(), diag::note_template_param_here);
return ExprError();
case NPV_Error:
return ExprError();
case NPV_NullPointer:
Diag(Arg->getExprLoc(), diag::warn_cxx98_compat_template_arg_null);
Converted = TemplateArgument(Context.getCanonicalType(ParamType),
/*isNullPtr*/true);
return Arg;
}
}
// -- For a non-type template-parameter of type pointer to data
// member, qualification conversions (4.4) are applied.
assert(ParamType->isMemberPointerType() && "Only pointers to members remain");
if (CheckTemplateArgumentPointerToMember(*this, Param, ParamType, Arg,
Converted))
return ExprError();
return Arg;
}
static void DiagnoseTemplateParameterListArityMismatch(
Sema &S, TemplateParameterList *New, TemplateParameterList *Old,
Sema::TemplateParameterListEqualKind Kind, SourceLocation TemplateArgLoc);
/// Check a template argument against its corresponding
/// template template parameter.
///
/// This routine implements the semantics of C++ [temp.arg.template].
/// It returns true if an error occurred, and false otherwise.
bool Sema::CheckTemplateTemplateArgument(TemplateParameterList *Params,
TemplateArgumentLoc &Arg) {
TemplateName Name = Arg.getArgument().getAsTemplateOrTemplatePattern();
TemplateDecl *Template = Name.getAsTemplateDecl();
if (!Template) {
// Any dependent template name is fine.
assert(Name.isDependent() && "Non-dependent template isn't a declaration?");
return false;
}
if (Template->isInvalidDecl())
return true;
// C++0x [temp.arg.template]p1:
// A template-argument for a template template-parameter shall be
// the name of a class template or an alias template, expressed as an
// id-expression. When the template-argument names a class template, only
// primary class templates are considered when matching the
// template template argument with the corresponding parameter;
// partial specializations are not considered even if their
// parameter lists match that of the template template parameter.
//
// Note that we also allow template template parameters here, which
// will happen when we are dealing with, e.g., class template
// partial specializations.
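// For example (illustrative; names are placeholders):
//   template<template<class> class TT> struct U;
//   template<class T> struct V {};
//   template<class T> using W = V<T>;
//   U<V> u1;   // OK: V names a class template
//   U<W> u2;   // OK: W names an alias template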
if (!isa<ClassTemplateDecl>(Template) &&
!isa<TemplateTemplateParmDecl>(Template) &&
!isa<TypeAliasTemplateDecl>(Template) &&
!isa<BuiltinTemplateDecl>(Template)) {
assert(isa<FunctionTemplateDecl>(Template) &&
"Only function templates are possible here");
Diag(Arg.getLocation(), diag::err_template_arg_not_valid_template);
Diag(Template->getLocation(), diag::note_template_arg_refers_here_func)
<< Template;
}
// C++1z [temp.arg.template]p3: (DR 150)
// A template-argument matches a template template-parameter P when P
// is at least as specialized as the template-argument A.
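// For example (illustrative; names are placeholders, assuming relaxed
// template template argument matching is enabled):
//   template<template<class> class TT> struct W;
//   template<class T, class U = int> struct X {};
//   W<X> w;   // OK: X is at least as specialized as the parameter TT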
if (getLangOpts().RelaxedTemplateTemplateArgs) {
// Quick check for the common case:
// If P contains a parameter pack, then A [...] matches P if each of A's
// template parameters matches the corresponding template parameter in
// the template-parameter-list of P.
if (TemplateParameterListsAreEqual(
Template->getTemplateParameters(), Params, false,
TPL_TemplateTemplateArgumentMatch, Arg.getLocation()))
return false;
if (isTemplateTemplateParameterAtLeastAsSpecializedAs(Params, Template,
Arg.getLocation()))
return false;
// FIXME: Produce better diagnostics for deduction failures.
}
return !TemplateParameterListsAreEqual(Template->getTemplateParameters(),
Params,
true,
TPL_TemplateTemplateArgumentMatch,
Arg.getLocation());
}
/// Given a non-type template argument that refers to a
/// declaration and the type of its corresponding non-type template
/// parameter, produce an expression that properly refers to that
/// declaration.
ExprResult
Sema::BuildExpressionFromDeclTemplateArgument(const TemplateArgument &Arg,
QualType ParamType,
SourceLocation Loc) {
// C++ [temp.param]p8:
//
// A non-type template-parameter of type "array of T" or
// "function returning T" is adjusted to be of type "pointer to
// T" or "pointer to function returning T", respectively.
if (ParamType->isArrayType())
ParamType = Context.getArrayDecayedType(ParamType);
else if (ParamType->isFunctionType())
ParamType = Context.getPointerType(ParamType);
// For a NULL non-type template argument, return nullptr cast to the
// parameter's type.
if (Arg.getKind() == TemplateArgument::NullPtr) {
return ImpCastExprToType(
new (Context) CXXNullPtrLiteralExpr(Context.NullPtrTy, Loc),
ParamType,
ParamType->getAs<MemberPointerType>()
? CK_NullToMemberPointer
: CK_NullToPointer);
}
assert(Arg.getKind() == TemplateArgument::Declaration &&
"Only declaration template arguments permitted here");
ValueDecl *VD = Arg.getAsDecl();
if (VD->getDeclContext()->isRecord() &&
(isa<CXXMethodDecl>(VD) || isa<FieldDecl>(VD) ||
isa<IndirectFieldDecl>(VD))) {
// If the value is a class member, we might have a pointer-to-member.
// Determine whether the non-type template template parameter is of
// pointer-to-member type. If so, we need to build an appropriate
// expression for a pointer-to-member, since a "normal" DeclRefExpr
// would refer to the member itself.
if (ParamType->isMemberPointerType()) {
QualType ClassType
= Context.getTypeDeclType(cast<RecordDecl>(VD->getDeclContext()));
NestedNameSpecifier *Qualifier
= NestedNameSpecifier::Create(Context, nullptr, false,
ClassType.getTypePtr());
CXXScopeSpec SS;
SS.MakeTrivial(Context, Qualifier, Loc);
// The actual value-ness of this is unimportant, but for
// internal consistency's sake, references to instance methods
// are r-values.
ExprValueKind VK = VK_LValue;
if (isa<CXXMethodDecl>(VD) && cast<CXXMethodDecl>(VD)->isInstance())
VK = VK_RValue;
ExprResult RefExpr = BuildDeclRefExpr(VD,
VD->getType().getNonReferenceType(),
VK,
Loc,
&SS);
if (RefExpr.isInvalid())
return ExprError();
RefExpr = CreateBuiltinUnaryOp(Loc, UO_AddrOf, RefExpr.get());
// We might need to perform a trailing qualification conversion, since
// the element type on the parameter could be more qualified than the
// element type in the expression we constructed.
bool ObjCLifetimeConversion;
if (IsQualificationConversion(((Expr*) RefExpr.get())->getType(),
ParamType.getUnqualifiedType(), false,
ObjCLifetimeConversion))
RefExpr = ImpCastExprToType(RefExpr.get(), ParamType.getUnqualifiedType(), CK_NoOp);
assert(!RefExpr.isInvalid() &&
Context.hasSameType(((Expr*) RefExpr.get())->getType(),
ParamType.getUnqualifiedType()));
return RefExpr;
}
}
QualType T = VD->getType().getNonReferenceType();
if (ParamType->isPointerType()) {
// When the non-type template parameter is a pointer, take the
// address of the declaration.
ExprResult RefExpr = BuildDeclRefExpr(VD, T, VK_LValue, Loc);
if (RefExpr.isInvalid())
return ExprError();
if (!Context.hasSameUnqualifiedType(ParamType->getPointeeType(), T) &&
(T->isFunctionType() || T->isArrayType())) {
// Decay functions and arrays unless we're forming a pointer to array.
RefExpr = DefaultFunctionArrayConversion(RefExpr.get());
if (RefExpr.isInvalid())
return ExprError();
return RefExpr;
}
// Take the address of everything else
return CreateBuiltinUnaryOp(Loc, UO_AddrOf, RefExpr.get());
}
ExprValueKind VK = VK_RValue;
// If the non-type template parameter has reference type, qualify the
// resulting declaration reference with the extra qualifiers on the
// type that the reference refers to.
if (const ReferenceType *TargetRef = ParamType->getAs<ReferenceType>()) {
VK = VK_LValue;
T = Context.getQualifiedType(T,
TargetRef->getPointeeType().getQualifiers());
} else if (isa<FunctionDecl>(VD)) {
// References to functions are always lvalues.
VK = VK_LValue;
}
return BuildDeclRefExpr(VD, T, VK, Loc);
}
/// Construct a new expression that refers to the given
/// integral template argument with the given source-location
/// information.
///
/// This routine takes care of the mapping from an integral template
/// argument (which may have any integral type) to the appropriate
/// literal value.
ExprResult
Sema::BuildExpressionFromIntegralTemplateArgument(const TemplateArgument &Arg,
SourceLocation Loc) {
assert(Arg.getKind() == TemplateArgument::Integral &&
"Operation is only valid for integral template arguments");
QualType OrigT = Arg.getIntegralType();
// If this is an enum type that we're instantiating, we need to use an integer
// type the same size as the enum's underlying type. We don't want to build an
// IntegerLiteral with enum type. With C++11 enumerations, the underlying
// integer type of an enum can be any integral type, so make sure we create the
// right kind of literal for it.
QualType T = OrigT;
if (const EnumType *ET = OrigT->getAs<EnumType>())
T = ET->getDecl()->getIntegerType();
Expr *E;
if (T->isAnyCharacterType()) {
CharacterLiteral::CharacterKind Kind;
if (T->isWideCharType())
Kind = CharacterLiteral::Wide;
else if (T->isChar8Type() && getLangOpts().Char8)
Kind = CharacterLiteral::UTF8;
else if (T->isChar16Type())
Kind = CharacterLiteral::UTF16;
else if (T->isChar32Type())
Kind = CharacterLiteral::UTF32;
else
Kind = CharacterLiteral::Ascii;
E = new (Context) CharacterLiteral(Arg.getAsIntegral().getZExtValue(),
Kind, T, Loc);
} else if (T->isBooleanType()) {
E = new (Context) CXXBoolLiteralExpr(Arg.getAsIntegral().getBoolValue(),
T, Loc);
} else if (T->isNullPtrType()) {
E = new (Context) CXXNullPtrLiteralExpr(Context.NullPtrTy, Loc);
} else {
E = IntegerLiteral::Create(Context, Arg.getAsIntegral(), T, Loc);
}
if (OrigT->isEnumeralType()) {
// FIXME: This is a hack. We need a better way to handle substituted
// non-type template parameters.
E = CStyleCastExpr::Create(Context, OrigT, VK_RValue, CK_IntegralCast, E,
nullptr,
Context.getTrivialTypeSourceInfo(OrigT, Loc),
Loc, Loc);
}
return E;
}
/// Match two template parameters within template parameter lists.
static bool MatchTemplateParameterKind(Sema &S, NamedDecl *New, NamedDecl *Old,
bool Complain,
Sema::TemplateParameterListEqualKind Kind,
SourceLocation TemplateArgLoc) {
// Check the actual kind (type, non-type, template).
if (Old->getKind() != New->getKind()) {
if (Complain) {
unsigned NextDiag = diag::err_template_param_different_kind;
if (TemplateArgLoc.isValid()) {
S.Diag(TemplateArgLoc, diag::err_template_arg_template_params_mismatch);
NextDiag = diag::note_template_param_different_kind;
}
S.Diag(New->getLocation(), NextDiag)
<< (Kind != Sema::TPL_TemplateMatch);
S.Diag(Old->getLocation(), diag::note_template_prev_declaration)
<< (Kind != Sema::TPL_TemplateMatch);
}
return false;
}
// Check that both are parameter packs or neither are parameter packs.
// However, if we are matching a template template argument to a
// template template parameter, the template template parameter can have
// a parameter pack where the template template argument does not.
if (Old->isTemplateParameterPack() != New->isTemplateParameterPack() &&
!(Kind == Sema::TPL_TemplateTemplateArgumentMatch &&
Old->isTemplateParameterPack())) {
if (Complain) {
unsigned NextDiag = diag::err_template_parameter_pack_non_pack;
if (TemplateArgLoc.isValid()) {
S.Diag(TemplateArgLoc,
diag::err_template_arg_template_params_mismatch);
NextDiag = diag::note_template_parameter_pack_non_pack;
}
unsigned ParamKind = isa<TemplateTypeParmDecl>(New)? 0
: isa<NonTypeTemplateParmDecl>(New)? 1
: 2;
S.Diag(New->getLocation(), NextDiag)
<< ParamKind << New->isParameterPack();
S.Diag(Old->getLocation(), diag::note_template_parameter_pack_here)
<< ParamKind << Old->isParameterPack();
}
return false;
}
// For non-type template parameters, check the type of the parameter.
if (NonTypeTemplateParmDecl *OldNTTP
= dyn_cast<NonTypeTemplateParmDecl>(Old)) {
NonTypeTemplateParmDecl *NewNTTP = cast<NonTypeTemplateParmDecl>(New);
// If we are matching a template template argument to a template
// template parameter and one of the non-type template parameter types
// is dependent, then we must wait until template instantiation time
// to actually compare the arguments.
if (Kind == Sema::TPL_TemplateTemplateArgumentMatch &&
(OldNTTP->getType()->isDependentType() ||
NewNTTP->getType()->isDependentType()))
return true;
if (!S.Context.hasSameType(OldNTTP->getType(), NewNTTP->getType())) {
if (Complain) {
unsigned NextDiag = diag::err_template_nontype_parm_different_type;
if (TemplateArgLoc.isValid()) {
S.Diag(TemplateArgLoc,
diag::err_template_arg_template_params_mismatch);
NextDiag = diag::note_template_nontype_parm_different_type;
}
S.Diag(NewNTTP->getLocation(), NextDiag)
<< NewNTTP->getType()
<< (Kind != Sema::TPL_TemplateMatch);
S.Diag(OldNTTP->getLocation(),
diag::note_template_nontype_parm_prev_declaration)
<< OldNTTP->getType();
}
return false;
}
return true;
}
// For template template parameters, check the template parameter types.
// The template parameter lists of template template
// parameters must agree.
if (TemplateTemplateParmDecl *OldTTP
= dyn_cast<TemplateTemplateParmDecl>(Old)) {
TemplateTemplateParmDecl *NewTTP = cast<TemplateTemplateParmDecl>(New);
return S.TemplateParameterListsAreEqual(NewTTP->getTemplateParameters(),
OldTTP->getTemplateParameters(),
Complain,
(Kind == Sema::TPL_TemplateMatch
? Sema::TPL_TemplateTemplateParmMatch
: Kind),
TemplateArgLoc);
}
return true;
}
/// Diagnose a known arity mismatch when comparing template argument
/// lists.
static
void DiagnoseTemplateParameterListArityMismatch(Sema &S,
TemplateParameterList *New,
TemplateParameterList *Old,
Sema::TemplateParameterListEqualKind Kind,
SourceLocation TemplateArgLoc) {
unsigned NextDiag = diag::err_template_param_list_different_arity;
if (TemplateArgLoc.isValid()) {
S.Diag(TemplateArgLoc, diag::err_template_arg_template_params_mismatch);
NextDiag = diag::note_template_param_list_different_arity;
}
S.Diag(New->getTemplateLoc(), NextDiag)
<< (New->size() > Old->size())
<< (Kind != Sema::TPL_TemplateMatch)
<< SourceRange(New->getTemplateLoc(), New->getRAngleLoc());
S.Diag(Old->getTemplateLoc(), diag::note_template_prev_declaration)
<< (Kind != Sema::TPL_TemplateMatch)
<< SourceRange(Old->getTemplateLoc(), Old->getRAngleLoc());
}
/// Determine whether the given template parameter lists are
/// equivalent.
///
/// \param New The new template parameter list, typically written in the
/// source code as part of a new template declaration.
///
/// \param Old The old template parameter list, typically found via
/// name lookup of the template declared with this template parameter
/// list.
///
/// \param Complain If true, this routine will produce a diagnostic if
/// the template parameter lists are not equivalent.
///
/// \param Kind describes how we are to match the template parameter lists.
///
/// \param TemplateArgLoc If this source location is valid, then we
/// are actually checking the template parameter list of a template
/// argument (New) against the template parameter list of its
/// corresponding template template parameter (Old). We produce
/// slightly different diagnostics in this scenario.
///
/// \returns True if the template parameter lists are equal, false
/// otherwise.
bool
Sema::TemplateParameterListsAreEqual(TemplateParameterList *New,
TemplateParameterList *Old,
bool Complain,
TemplateParameterListEqualKind Kind,
SourceLocation TemplateArgLoc) {
if (Old->size() != New->size() && Kind != TPL_TemplateTemplateArgumentMatch) {
if (Complain)
DiagnoseTemplateParameterListArityMismatch(*this, New, Old, Kind,
TemplateArgLoc);
return false;
}
// C++0x [temp.arg.template]p3:
// A template-argument matches a template template-parameter (call it P)
// when each of the template parameters in the template-parameter-list of
// the template-argument's corresponding class template or alias template
// (call it A) matches the corresponding template parameter in the
// template-parameter-list of P. [...]
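// For example (illustrative; names are placeholders):
//   template<template<class, int> class TT> struct U;
//   template<class T, int N> struct A {};
//   U<A> u;   // each of A's parameters matches the corresponding parameter of TT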
TemplateParameterList::iterator NewParm = New->begin();
TemplateParameterList::iterator NewParmEnd = New->end();
for (TemplateParameterList::iterator OldParm = Old->begin(),
OldParmEnd = Old->end();
OldParm != OldParmEnd; ++OldParm) {
if (Kind != TPL_TemplateTemplateArgumentMatch ||
!(*OldParm)->isTemplateParameterPack()) {
if (NewParm == NewParmEnd) {
if (Complain)
DiagnoseTemplateParameterListArityMismatch(*this, New, Old, Kind,
TemplateArgLoc);
return false;
}
if (!MatchTemplateParameterKind(*this, *NewParm, *OldParm, Complain,
Kind, TemplateArgLoc))
return false;
++NewParm;
continue;
}
// C++0x [temp.arg.template]p3:
// [...] When P's template-parameter-list contains a template parameter
// pack (14.5.3), the template parameter pack will match zero or more
// template parameters or template parameter packs in the
// template-parameter-list of A with the same type and form as the
// template parameter pack in P (ignoring whether those template
// parameters are template parameter packs).
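// For example (illustrative; names are placeholders):
//   template<template<class...> class TT> struct Z;
//   template<class A, class B> struct P2 {};
//   Z<P2> z;   // TT's parameter pack matches both of P2's type parameters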
for (; NewParm != NewParmEnd; ++NewParm) {
if (!MatchTemplateParameterKind(*this, *NewParm, *OldParm, Complain,
Kind, TemplateArgLoc))
return false;
}
}
// Make sure we exhausted all of the arguments.
if (NewParm != NewParmEnd) {
if (Complain)
DiagnoseTemplateParameterListArityMismatch(*this, New, Old, Kind,
TemplateArgLoc);
return false;
}
return true;
}
/// Check whether a template can be declared within this scope.
///
/// If the template declaration is valid in this scope, returns
/// false. Otherwise, issues a diagnostic and returns true.
bool
Sema::CheckTemplateDeclScope(Scope *S, TemplateParameterList *TemplateParams) {
if (!S)
return false;
// Find the nearest enclosing declaration scope.
while ((S->getFlags() & Scope::DeclScope) == 0 ||
(S->getFlags() & Scope::TemplateParamScope) != 0)
S = S->getParent();
// C++ [temp]p4:
// A template [...] shall not have C linkage.
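// For example (illustrative):
//   extern "C" { template<class T> void f(T); }   // ill-formed: C linkage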
DeclContext *Ctx = S->getEntity();
if (Ctx && Ctx->isExternCContext()) {
Diag(TemplateParams->getTemplateLoc(), diag::err_template_linkage)
<< TemplateParams->getSourceRange();
if (const LinkageSpecDecl *LSD = Ctx->getExternCContext())
Diag(LSD->getExternLoc(), diag::note_extern_c_begins_here);
return true;
}
Ctx = Ctx->getRedeclContext();
// C++ [temp]p2:
// A template-declaration can appear only as a namespace scope or
// class scope declaration.
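// For example (illustrative; names are placeholders):
//   void g() { template<class T> struct Local {}; }   // ill-formed: block scope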
if (Ctx) {
if (Ctx->isFileContext())
return false;
if (CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(Ctx)) {
// C++ [temp.mem]p2:
// A local class shall not have member templates.
if (RD->isLocalClass())
return Diag(TemplateParams->getTemplateLoc(),
diag::err_template_inside_local_class)
<< TemplateParams->getSourceRange();
else
return false;
}
}
return Diag(TemplateParams->getTemplateLoc(),
diag::err_template_outside_namespace_or_class_scope)
<< TemplateParams->getSourceRange();
}
/// Determine what kind of template specialization the given declaration
/// is.
static TemplateSpecializationKind getTemplateSpecializationKind(Decl *D) {
if (!D)
return TSK_Undeclared;
if (CXXRecordDecl *Record = dyn_cast<CXXRecordDecl>(D))
return Record->getTemplateSpecializationKind();
if (FunctionDecl *Function = dyn_cast<FunctionDecl>(D))
return Function->getTemplateSpecializationKind();
if (VarDecl *Var = dyn_cast<VarDecl>(D))
return Var->getTemplateSpecializationKind();
return TSK_Undeclared;
}
/// Check whether a specialization is well-formed in the current
/// context.
///
/// This routine determines whether a template specialization can be declared
/// in the current context (C++ [temp.expl.spec]p2).
///
/// \param S the semantic analysis object for which this check is being
/// performed.
///
/// \param Specialized the entity being specialized or instantiated, which
/// may be a kind of template (class template, function template, etc.) or
/// a member of a class template (member function, static data member,
/// member class).
///
/// \param PrevDecl the previous declaration of this entity, if any.
///
/// \param Loc the location of the explicit specialization or instantiation of
/// this entity.
///
/// \param IsPartialSpecialization whether this is a partial specialization of
/// a class template.
///
/// \returns true if there was an error that we cannot recover from, false
/// otherwise.
static bool CheckTemplateSpecializationScope(Sema &S,
NamedDecl *Specialized,
NamedDecl *PrevDecl,
SourceLocation Loc,
bool IsPartialSpecialization) {
// Keep these "kind" numbers in sync with the %select statements in the
// various diagnostics emitted by this routine.
int EntityKind = 0;
if (isa<ClassTemplateDecl>(Specialized))
EntityKind = IsPartialSpecialization? 1 : 0;
else if (isa<VarTemplateDecl>(Specialized))
EntityKind = IsPartialSpecialization ? 3 : 2;
else if (isa<FunctionTemplateDecl>(Specialized))
EntityKind = 4;
else if (isa<CXXMethodDecl>(Specialized))
EntityKind = 5;
else if (isa<VarDecl>(Specialized))
EntityKind = 6;
else if (isa<RecordDecl>(Specialized))
EntityKind = 7;
else if (isa<EnumDecl>(Specialized) && S.getLangOpts().CPlusPlus11)
EntityKind = 8;
else {
S.Diag(Loc, diag::err_template_spec_unknown_kind)
<< S.getLangOpts().CPlusPlus11;
S.Diag(Specialized->getLocation(), diag::note_specialized_entity);
return true;
}
// C++ [temp.expl.spec]p2:
// An explicit specialization may be declared in any scope in which
// the corresponding primary template may be defined.
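// For example (illustrative; names are placeholders):
//   template<class T> struct A {};
//   void f() { template<> struct A<int> {}; }   // ill-formed: function scope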
if (S.CurContext->getRedeclContext()->isFunctionOrMethod()) {
S.Diag(Loc, diag::err_template_spec_decl_function_scope)
<< Specialized;
return true;
}
// C++ [temp.class.spec]p6:
// A class template partial specialization may be declared in any
// scope in which the primary template may be defined.
DeclContext *SpecializedContext =
Specialized->getDeclContext()->getRedeclContext();
DeclContext *DC = S.CurContext->getRedeclContext();
// Make sure that this redeclaration (or definition) occurs in the same
// scope or an enclosing namespace.
if (!(DC->isFileContext() ? DC->Encloses(SpecializedContext)
: DC->Equals(SpecializedContext))) {
if (isa<TranslationUnitDecl>(SpecializedContext))
S.Diag(Loc, diag::err_template_spec_redecl_global_scope)
<< EntityKind << Specialized;
else {
auto *ND = cast<NamedDecl>(SpecializedContext);
int Diag = diag::err_template_spec_redecl_out_of_scope;
if (S.getLangOpts().MicrosoftExt && !DC->isRecord())
Diag = diag::ext_ms_template_spec_redecl_out_of_scope;
S.Diag(Loc, Diag) << EntityKind << Specialized
<< ND << isa<CXXRecordDecl>(ND);
}
S.Diag(Specialized->getLocation(), diag::note_specialized_entity);
// Don't allow specializing in the wrong class during error recovery.
// Otherwise, things can go horribly wrong.
if (DC->isRecord())
return true;
}
return false;
}
static SourceRange findTemplateParameterInType(unsigned Depth, Expr *E) {
if (!E->isTypeDependent())
return SourceLocation();
DependencyChecker Checker(Depth, /*IgnoreNonTypeDependent*/true);
Checker.TraverseStmt(E);
if (Checker.MatchLoc.isInvalid())
return E->getSourceRange();
return Checker.MatchLoc;
}
static SourceRange findTemplateParameter(unsigned Depth, TypeLoc TL) {
if (!TL.getType()->isDependentType())
return SourceLocation();
DependencyChecker Checker(Depth, /*IgnoreNonTypeDependent*/true);
Checker.TraverseTypeLoc(TL);
if (Checker.MatchLoc.isInvalid())
return TL.getSourceRange();
return Checker.MatchLoc;
}
/// Subroutine of Sema::CheckTemplatePartialSpecializationArgs
/// that checks non-type template partial specialization arguments.
static bool CheckNonTypeTemplatePartialSpecializationArgs(
Sema &S, SourceLocation TemplateNameLoc, NonTypeTemplateParmDecl *Param,
const TemplateArgument *Args, unsigned NumArgs, bool IsDefaultArgument) {
for (unsigned I = 0; I != NumArgs; ++I) {
if (Args[I].getKind() == TemplateArgument::Pack) {
if (CheckNonTypeTemplatePartialSpecializationArgs(
S, TemplateNameLoc, Param, Args[I].pack_begin(),
Args[I].pack_size(), IsDefaultArgument))
return true;
continue;
}
if (Args[I].getKind() != TemplateArgument::Expression)
continue;
Expr *ArgExpr = Args[I].getAsExpr();
// We can have a pack expansion of any of the bullets below.
if (PackExpansionExpr *Expansion = dyn_cast<PackExpansionExpr>(ArgExpr))
ArgExpr = Expansion->getPattern();
// Strip off any implicit casts we added as part of type checking.
while (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(ArgExpr))
ArgExpr = ICE->getSubExpr();
// C++ [temp.class.spec]p8:
// A non-type argument is non-specialized if it is the name of a
// non-type parameter. All other non-type arguments are
// specialized.
//
// Below, we check the two conditions that only apply to
// specialized non-type arguments, so skip any non-specialized
// arguments.
if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(ArgExpr))
if (isa<NonTypeTemplateParmDecl>(DRE->getDecl()))
continue;
// C++ [temp.class.spec]p9:
// Within the argument list of a class template partial
// specialization, the following restrictions apply:
// -- A partially specialized non-type argument expression
// shall not involve a template parameter of the partial
// specialization except when the argument expression is a
// simple identifier.
// -- The type of a template parameter corresponding to a
// specialized non-type argument shall not be dependent on a
// parameter of the specialization.
// DR1315 removes the first bullet, leaving an incoherent set of rules.
// We implement a compromise between the original rules and DR1315:
// -- A specialized non-type template argument shall not be
// type-dependent and the corresponding template parameter
// shall have a non-dependent type.
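// For example (illustrative; names are placeholders):
//   template<class T, T V> struct X;
//   template<class T> struct X<T, 1> {};   // rejected: the type of 'V' depends
//                                          // on a parameter of the specialization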
SourceRange ParamUseRange =
findTemplateParameterInType(Param->getDepth(), ArgExpr);
if (ParamUseRange.isValid()) {
if (IsDefaultArgument) {
S.Diag(TemplateNameLoc,
diag::err_dependent_non_type_arg_in_partial_spec);
S.Diag(ParamUseRange.getBegin(),
diag::note_dependent_non_type_default_arg_in_partial_spec)
<< ParamUseRange;
} else {
S.Diag(ParamUseRange.getBegin(),
diag::err_dependent_non_type_arg_in_partial_spec)
<< ParamUseRange;
}
return true;
}
ParamUseRange = findTemplateParameter(
Param->getDepth(), Param->getTypeSourceInfo()->getTypeLoc());
if (ParamUseRange.isValid()) {
S.Diag(IsDefaultArgument ? TemplateNameLoc : ArgExpr->getBeginLoc(),
diag::err_dependent_typed_non_type_arg_in_partial_spec)
<< Param->getType();
S.Diag(Param->getLocation(), diag::note_template_param_here)
<< (IsDefaultArgument ? ParamUseRange : SourceRange())
<< ParamUseRange;
return true;
}
}
return false;
}
/// Check the non-type template arguments of a class template
/// partial specialization according to C++ [temp.class.spec]p9.
///
/// \param TemplateNameLoc the location of the template name.
/// \param PrimaryTemplate the template parameters of the primary class
/// template.
/// \param NumExplicit the number of explicitly-specified template arguments.
/// \param TemplateArgs the template arguments of the class template
/// partial specialization.
///
/// \returns \c true if there was an error, \c false otherwise.
bool Sema::CheckTemplatePartialSpecializationArgs(
SourceLocation TemplateNameLoc, TemplateDecl *PrimaryTemplate,
unsigned NumExplicit, ArrayRef<TemplateArgument> TemplateArgs) {
// We have to be conservative when checking a template in a dependent
// context.
if (PrimaryTemplate->getDeclContext()->isDependentContext())
return false;
TemplateParameterList *TemplateParams =
PrimaryTemplate->getTemplateParameters();
for (unsigned I = 0, N = TemplateParams->size(); I != N; ++I) {
NonTypeTemplateParmDecl *Param
= dyn_cast<NonTypeTemplateParmDecl>(TemplateParams->getParam(I));
if (!Param)
continue;
if (CheckNonTypeTemplatePartialSpecializationArgs(*this, TemplateNameLoc,
Param, &TemplateArgs[I],
1, I >= NumExplicit))
return true;
}
return false;
}
DeclResult Sema::ActOnClassTemplateSpecialization(
Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
SourceLocation ModulePrivateLoc, TemplateIdAnnotation &TemplateId,
const ParsedAttributesView &Attr,
MultiTemplateParamsArg TemplateParameterLists, SkipBodyInfo *SkipBody) {
assert(TUK != TUK_Reference && "References are not specializations");
CXXScopeSpec &SS = TemplateId.SS;
  // NOTE: KWLoc is the location of the tag keyword. TemplateKWLoc, computed
  // below, instead stores the location of the outermost template keyword in
  // the declaration.
SourceLocation TemplateKWLoc = TemplateParameterLists.size() > 0
? TemplateParameterLists[0]->getTemplateLoc() : KWLoc;
SourceLocation TemplateNameLoc = TemplateId.TemplateNameLoc;
SourceLocation LAngleLoc = TemplateId.LAngleLoc;
SourceLocation RAngleLoc = TemplateId.RAngleLoc;
// Find the class template we're specializing
TemplateName Name = TemplateId.Template.get();
ClassTemplateDecl *ClassTemplate
= dyn_cast_or_null<ClassTemplateDecl>(Name.getAsTemplateDecl());
if (!ClassTemplate) {
Diag(TemplateNameLoc, diag::err_not_class_template_specialization)
<< (Name.getAsTemplateDecl() &&
isa<TemplateTemplateParmDecl>(Name.getAsTemplateDecl()));
return true;
}
bool isMemberSpecialization = false;
bool isPartialSpecialization = false;
// Check the validity of the template headers that introduce this
// template.
// FIXME: We probably shouldn't complain about these headers for
// friend declarations.
bool Invalid = false;
TemplateParameterList *TemplateParams =
MatchTemplateParametersToScopeSpecifier(
KWLoc, TemplateNameLoc, SS, &TemplateId,
TemplateParameterLists, TUK == TUK_Friend, isMemberSpecialization,
Invalid);
if (Invalid)
return true;
if (TemplateParams && TemplateParams->size() > 0) {
isPartialSpecialization = true;
if (TUK == TUK_Friend) {
Diag(KWLoc, diag::err_partial_specialization_friend)
<< SourceRange(LAngleLoc, RAngleLoc);
return true;
}
// C++ [temp.class.spec]p10:
// The template parameter list of a specialization shall not
// contain default template argument values.
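    // For illustration (not from the original source):
    //
    //   template<typename T> struct A {};
    //   template<typename T = int> struct A<T*> {};  // rejected below; the
    //                                                 // default argument is
    //                                                 // dropped for recovery.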
for (unsigned I = 0, N = TemplateParams->size(); I != N; ++I) {
Decl *Param = TemplateParams->getParam(I);
if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(Param)) {
if (TTP->hasDefaultArgument()) {
Diag(TTP->getDefaultArgumentLoc(),
diag::err_default_arg_in_partial_spec);
TTP->removeDefaultArgument();
}
} else if (NonTypeTemplateParmDecl *NTTP
= dyn_cast<NonTypeTemplateParmDecl>(Param)) {
if (Expr *DefArg = NTTP->getDefaultArgument()) {
Diag(NTTP->getDefaultArgumentLoc(),
diag::err_default_arg_in_partial_spec)
<< DefArg->getSourceRange();
NTTP->removeDefaultArgument();
}
} else {
TemplateTemplateParmDecl *TTP = cast<TemplateTemplateParmDecl>(Param);
if (TTP->hasDefaultArgument()) {
Diag(TTP->getDefaultArgument().getLocation(),
diag::err_default_arg_in_partial_spec)
<< TTP->getDefaultArgument().getSourceRange();
TTP->removeDefaultArgument();
}
}
}
} else if (TemplateParams) {
if (TUK == TUK_Friend)
Diag(KWLoc, diag::err_template_spec_friend)
<< FixItHint::CreateRemoval(
SourceRange(TemplateParams->getTemplateLoc(),
TemplateParams->getRAngleLoc()))
<< SourceRange(LAngleLoc, RAngleLoc);
} else {
assert(TUK == TUK_Friend && "should have a 'template<>' for this decl");
}
// Check that the specialization uses the same tag kind as the
// original template.
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
assert(Kind != TTK_Enum && "Invalid enum tag in class template spec!");
if (!isAcceptableTagRedeclaration(ClassTemplate->getTemplatedDecl(),
Kind, TUK == TUK_Definition, KWLoc,
ClassTemplate->getIdentifier())) {
Diag(KWLoc, diag::err_use_with_wrong_tag)
<< ClassTemplate
<< FixItHint::CreateReplacement(KWLoc,
ClassTemplate->getTemplatedDecl()->getKindName());
Diag(ClassTemplate->getTemplatedDecl()->getLocation(),
diag::note_previous_use);
Kind = ClassTemplate->getTemplatedDecl()->getTagKind();
}
// Translate the parser's template argument list in our AST format.
TemplateArgumentListInfo TemplateArgs =
makeTemplateArgumentListInfo(*this, TemplateId);
// Check for unexpanded parameter packs in any of the template arguments.
for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I)
if (DiagnoseUnexpandedParameterPack(TemplateArgs[I],
UPPC_PartialSpecialization))
return true;
// Check that the template argument list is well-formed for this
// template.
SmallVector<TemplateArgument, 4> Converted;
if (CheckTemplateArgumentList(ClassTemplate, TemplateNameLoc,
TemplateArgs, false, Converted))
return true;
// Find the class template (partial) specialization declaration that
// corresponds to these arguments.
if (isPartialSpecialization) {
if (CheckTemplatePartialSpecializationArgs(TemplateNameLoc, ClassTemplate,
TemplateArgs.size(), Converted))
return true;
// FIXME: Move this to CheckTemplatePartialSpecializationArgs so we
// also do it during instantiation.
bool InstantiationDependent;
if (!Name.isDependent() &&
!TemplateSpecializationType::anyDependentTemplateArguments(
TemplateArgs.arguments(), InstantiationDependent)) {
Diag(TemplateNameLoc, diag::err_partial_spec_fully_specialized)
<< ClassTemplate->getDeclName();
isPartialSpecialization = false;
}
}
void *InsertPos = nullptr;
ClassTemplateSpecializationDecl *PrevDecl = nullptr;
if (isPartialSpecialization)
// FIXME: Template parameter list matters, too
PrevDecl = ClassTemplate->findPartialSpecialization(Converted, InsertPos);
else
PrevDecl = ClassTemplate->findSpecialization(Converted, InsertPos);
ClassTemplateSpecializationDecl *Specialization = nullptr;
// Check whether we can declare a class template specialization in
// the current scope.
if (TUK != TUK_Friend &&
CheckTemplateSpecializationScope(*this, ClassTemplate, PrevDecl,
TemplateNameLoc,
isPartialSpecialization))
return true;
// The canonical type
QualType CanonType;
if (isPartialSpecialization) {
// Build the canonical type that describes the converted template
// arguments of the class template partial specialization.
TemplateName CanonTemplate = Context.getCanonicalTemplateName(Name);
CanonType = Context.getTemplateSpecializationType(CanonTemplate,
Converted);
if (Context.hasSameType(CanonType,
ClassTemplate->getInjectedClassNameSpecialization())) {
// C++ [temp.class.spec]p9b3:
//
// -- The argument list of the specialization shall not be identical
// to the implicit argument list of the primary template.
//
// This rule has since been removed, because it's redundant given DR1495,
// but we keep it because it produces better diagnostics and recovery.
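      // For illustration (not from the original source):
      //
      //   template<typename T> struct A {};
      //   template<typename T> struct A<T> {};  // diagnosed here: <T> matches
      //                                          // the primary template's own
      //                                          // argument list.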
Diag(TemplateNameLoc, diag::err_partial_spec_args_match_primary_template)
<< /*class template*/0 << (TUK == TUK_Definition)
<< FixItHint::CreateRemoval(SourceRange(LAngleLoc, RAngleLoc));
return CheckClassTemplate(S, TagSpec, TUK, KWLoc, SS,
ClassTemplate->getIdentifier(),
TemplateNameLoc,
Attr,
TemplateParams,
AS_none, /*ModulePrivateLoc=*/SourceLocation(),
/*FriendLoc*/SourceLocation(),
TemplateParameterLists.size() - 1,
TemplateParameterLists.data());
}
// Create a new class template partial specialization declaration node.
ClassTemplatePartialSpecializationDecl *PrevPartial
= cast_or_null<ClassTemplatePartialSpecializationDecl>(PrevDecl);
ClassTemplatePartialSpecializationDecl *Partial
= ClassTemplatePartialSpecializationDecl::Create(Context, Kind,
ClassTemplate->getDeclContext(),
KWLoc, TemplateNameLoc,
TemplateParams,
ClassTemplate,
Converted,
TemplateArgs,
CanonType,
PrevPartial);
SetNestedNameSpecifier(*this, Partial, SS);
if (TemplateParameterLists.size() > 1 && SS.isSet()) {
Partial->setTemplateParameterListsInfo(
Context, TemplateParameterLists.drop_back(1));
}
if (!PrevPartial)
ClassTemplate->AddPartialSpecialization(Partial, InsertPos);
Specialization = Partial;
// If we are providing an explicit specialization of a member class
// template specialization, make a note of that.
if (PrevPartial && PrevPartial->getInstantiatedFromMember())
PrevPartial->setMemberSpecialization();
CheckTemplatePartialSpecialization(Partial);
} else {
// Create a new class template specialization declaration node for
// this explicit specialization or friend declaration.
Specialization
= ClassTemplateSpecializationDecl::Create(Context, Kind,
ClassTemplate->getDeclContext(),
KWLoc, TemplateNameLoc,
ClassTemplate,
Converted,
PrevDecl);
SetNestedNameSpecifier(*this, Specialization, SS);
if (TemplateParameterLists.size() > 0) {
Specialization->setTemplateParameterListsInfo(Context,
TemplateParameterLists);
}
if (!PrevDecl)
ClassTemplate->AddSpecialization(Specialization, InsertPos);
if (CurContext->isDependentContext()) {
TemplateName CanonTemplate = Context.getCanonicalTemplateName(Name);
CanonType = Context.getTemplateSpecializationType(
CanonTemplate, Converted);
} else {
CanonType = Context.getTypeDeclType(Specialization);
}
}
// C++ [temp.expl.spec]p6:
// If a template, a member template or the member of a class template is
// explicitly specialized then that specialization shall be declared
// before the first use of that specialization that would cause an implicit
// instantiation to take place, in every translation unit in which such a
// use occurs; no diagnostic is required.
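  // For illustration (not from the original source):
  //
  //   template<typename T> struct A {};
  //   A<int> a;                      // implicit instantiation of A<int>
  //   template<> struct A<int> {};  // diagnosed below: specialization after
  //                                  // instantiation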
if (PrevDecl && PrevDecl->getPointOfInstantiation().isValid()) {
bool Okay = false;
for (Decl *Prev = PrevDecl; Prev; Prev = Prev->getPreviousDecl()) {
// Is there any previous explicit specialization declaration?
if (getTemplateSpecializationKind(Prev) == TSK_ExplicitSpecialization) {
Okay = true;
break;
}
}
if (!Okay) {
SourceRange Range(TemplateNameLoc, RAngleLoc);
Diag(TemplateNameLoc, diag::err_specialization_after_instantiation)
<< Context.getTypeDeclType(Specialization) << Range;
Diag(PrevDecl->getPointOfInstantiation(),
diag::note_instantiation_required_here)
<< (PrevDecl->getTemplateSpecializationKind()
!= TSK_ImplicitInstantiation);
return true;
}
}
// If this is not a friend, note that this is an explicit specialization.
if (TUK != TUK_Friend)
Specialization->setSpecializationKind(TSK_ExplicitSpecialization);
// Check that this isn't a redefinition of this specialization.
if (TUK == TUK_Definition) {
RecordDecl *Def = Specialization->getDefinition();
NamedDecl *Hidden = nullptr;
if (Def && SkipBody && !hasVisibleDefinition(Def, &Hidden)) {
SkipBody->ShouldSkip = true;
SkipBody->Previous = Def;
makeMergedDefinitionVisible(Hidden);
} else if (Def) {
SourceRange Range(TemplateNameLoc, RAngleLoc);
Diag(TemplateNameLoc, diag::err_redefinition) << Specialization << Range;
Diag(Def->getLocation(), diag::note_previous_definition);
Specialization->setInvalidDecl();
return true;
}
}
ProcessDeclAttributeList(S, Specialization, Attr);
// Add alignment attributes if necessary; these attributes are checked when
// the ASTContext lays out the structure.
if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(Specialization);
AddMsStructLayoutForRecord(Specialization);
}
if (ModulePrivateLoc.isValid())
Diag(Specialization->getLocation(), diag::err_module_private_specialization)
<< (isPartialSpecialization? 1 : 0)
<< FixItHint::CreateRemoval(ModulePrivateLoc);
// Build the fully-sugared type for this class template
// specialization as the user wrote in the specialization
// itself. This means that we'll pretty-print the type retrieved
// from the specialization's declaration the way that the user
// actually wrote the specialization, rather than formatting the
// name based on the "canonical" representation used to store the
// template arguments in the specialization.
TypeSourceInfo *WrittenTy
= Context.getTemplateSpecializationTypeInfo(Name, TemplateNameLoc,
TemplateArgs, CanonType);
if (TUK != TUK_Friend) {
Specialization->setTypeAsWritten(WrittenTy);
Specialization->setTemplateKeywordLoc(TemplateKWLoc);
}
// C++ [temp.expl.spec]p9:
// A template explicit specialization is in the scope of the
// namespace in which the template was defined.
//
// We actually implement this paragraph where we set the semantic
// context (in the creation of the ClassTemplateSpecializationDecl),
// but we also maintain the lexical context where the actual
// definition occurs.
Specialization->setLexicalDeclContext(CurContext);
// We may be starting the definition of this specialization.
if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip))
Specialization->startDefinition();
if (TUK == TUK_Friend) {
FriendDecl *Friend = FriendDecl::Create(Context, CurContext,
TemplateNameLoc,
WrittenTy,
/*FIXME:*/KWLoc);
Friend->setAccess(AS_public);
CurContext->addDecl(Friend);
} else {
// Add the specialization into its lexical context, so that it can
// be seen when iterating through the list of declarations in that
// context. However, specializations are not found by name lookup.
CurContext->addDecl(Specialization);
}
if (SkipBody && SkipBody->ShouldSkip)
return SkipBody->Previous;
return Specialization;
}
Decl *Sema::ActOnTemplateDeclarator(Scope *S,
MultiTemplateParamsArg TemplateParameterLists,
Declarator &D) {
Decl *NewDecl = HandleDeclarator(S, D, TemplateParameterLists);
ActOnDocumentableDecl(NewDecl);
return NewDecl;
}
Decl *Sema::ActOnConceptDefinition(Scope *S,
MultiTemplateParamsArg TemplateParameterLists,
IdentifierInfo *Name, SourceLocation NameLoc,
Expr *ConstraintExpr) {
DeclContext *DC = CurContext;
if (!DC->getRedeclContext()->isFileContext()) {
Diag(NameLoc,
diag::err_concept_decls_may_only_appear_in_global_namespace_scope);
return nullptr;
}
if (TemplateParameterLists.size() > 1) {
Diag(NameLoc, diag::err_concept_extra_headers);
return nullptr;
}
if (TemplateParameterLists.front()->size() == 0) {
Diag(NameLoc, diag::err_concept_no_parameters);
return nullptr;
}
ConceptDecl *NewDecl = ConceptDecl::Create(Context, DC, NameLoc, Name,
TemplateParameterLists.front(),
ConstraintExpr);
if (!ConstraintExpr->isTypeDependent() &&
ConstraintExpr->getType() != Context.BoolTy) {
// C++2a [temp.constr.atomic]p3:
// E shall be a constant expression of type bool.
// TODO: Do this check for individual atomic constraints
// and not the constraint expression. Probably should do it in
// ParseConstraintExpression.
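    // For illustration (not from the original source):
    //
    //   template<typename T> concept C = sizeof(T);  // diagnosed: the
    //                                                 // constraint has type
    //                                                 // std::size_t, not bool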
Diag(ConstraintExpr->getSourceRange().getBegin(),
diag::err_concept_initialized_with_non_bool_type)
<< ConstraintExpr->getType();
NewDecl->setInvalidDecl();
}
if (NewDecl->getAssociatedConstraints()) {
// C++2a [temp.concept]p4:
// A concept shall not have associated constraints.
// TODO: Make a test once we have actual associated constraints.
Diag(NameLoc, diag::err_concept_no_associated_constraints);
NewDecl->setInvalidDecl();
}
// Check for conflicting previous declaration.
DeclarationNameInfo NameInfo(NewDecl->getDeclName(), NameLoc);
LookupResult Previous(*this, NameInfo, LookupOrdinaryName,
ForVisibleRedeclaration);
LookupName(Previous, S);
FilterLookupForScope(Previous, DC, S, /*ConsiderLinkage=*/false,
/*AllowInlineNamespace*/false);
if (!Previous.empty()) {
auto *Old = Previous.getRepresentativeDecl();
Diag(NameLoc, isa<ConceptDecl>(Old) ? diag::err_redefinition :
diag::err_redefinition_different_kind) << NewDecl->getDeclName();
Diag(Old->getLocation(), diag::note_previous_definition);
}
ActOnDocumentableDecl(NewDecl);
PushOnScopeChains(NewDecl, S);
return NewDecl;
}
/// Strips various properties off an implicit instantiation
/// that has just been explicitly specialized.
static void StripImplicitInstantiation(NamedDecl *D) {
D->dropAttr<DLLImportAttr>();
D->dropAttr<DLLExportAttr>();
if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
FD->setInlineSpecified(false);
}
/// Compute the diagnostic location for an explicit instantiation
/// declaration or definition.
static SourceLocation DiagLocForExplicitInstantiation(
NamedDecl* D, SourceLocation PointOfInstantiation) {
// Explicit instantiations following a specialization have no effect and
// hence no PointOfInstantiation. In that case, walk decl backwards
// until a valid name loc is found.
SourceLocation PrevDiagLoc = PointOfInstantiation;
for (Decl *Prev = D; Prev && !PrevDiagLoc.isValid();
Prev = Prev->getPreviousDecl()) {
PrevDiagLoc = Prev->getLocation();
}
assert(PrevDiagLoc.isValid() &&
"Explicit instantiation without point of instantiation?");
return PrevDiagLoc;
}
/// Diagnose cases where we have an explicit template specialization
/// before/after an explicit template instantiation, producing diagnostics
/// for those cases where they are required and determining whether the
/// new specialization/instantiation will have any effect.
///
/// \param NewLoc the location of the new explicit specialization or
/// instantiation.
///
/// \param NewTSK the kind of the new explicit specialization or instantiation.
///
/// \param PrevDecl the previous declaration of the entity.
///
/// \param PrevTSK the kind of the old explicit specialization or instantiation.
///
/// \param PrevPointOfInstantiation if valid, indicates where the previous
/// declaration was instantiated (either implicitly or explicitly).
///
/// \param HasNoEffect will be set to true to indicate that the new
/// specialization or instantiation has no effect and should be ignored.
///
/// \returns true if there was an error that should prevent the introduction of
/// the new declaration into the AST, false otherwise.
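///
/// For illustration (not part of the original documentation), one sequence
/// handled here:
///
/// \code
/// template<typename T> struct A {};
/// template<> struct A<int> {};  // explicit specialization
/// template struct A<int>;       // explicit instantiation: a warning is
///                               // issued and \p HasNoEffect is set to true
/// \endcode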
bool
Sema::CheckSpecializationInstantiationRedecl(SourceLocation NewLoc,
TemplateSpecializationKind NewTSK,
NamedDecl *PrevDecl,
TemplateSpecializationKind PrevTSK,
SourceLocation PrevPointOfInstantiation,
bool &HasNoEffect) {
HasNoEffect = false;
switch (NewTSK) {
case TSK_Undeclared:
case TSK_ImplicitInstantiation:
assert(
(PrevTSK == TSK_Undeclared || PrevTSK == TSK_ImplicitInstantiation) &&
"previous declaration must be implicit!");
return false;
case TSK_ExplicitSpecialization:
switch (PrevTSK) {
case TSK_Undeclared:
case TSK_ExplicitSpecialization:
// Okay, we're just specializing something that is either already
// explicitly specialized or has merely been mentioned without any
// instantiation.
return false;
case TSK_ImplicitInstantiation:
if (PrevPointOfInstantiation.isInvalid()) {
// The declaration itself has not actually been instantiated, so it is
// still okay to specialize it.
StripImplicitInstantiation(PrevDecl);
return false;
}
// Fall through
LLVM_FALLTHROUGH;
case TSK_ExplicitInstantiationDeclaration:
case TSK_ExplicitInstantiationDefinition:
assert((PrevTSK == TSK_ImplicitInstantiation ||
PrevPointOfInstantiation.isValid()) &&
"Explicit instantiation without point of instantiation?");
// C++ [temp.expl.spec]p6:
// If a template, a member template or the member of a class template
// is explicitly specialized then that specialization shall be declared
// before the first use of that specialization that would cause an
// implicit instantiation to take place, in every translation unit in
// which such a use occurs; no diagnostic is required.
for (Decl *Prev = PrevDecl; Prev; Prev = Prev->getPreviousDecl()) {
// Is there any previous explicit specialization declaration?
if (getTemplateSpecializationKind(Prev) == TSK_ExplicitSpecialization)
return false;
}
Diag(NewLoc, diag::err_specialization_after_instantiation)
<< PrevDecl;
Diag(PrevPointOfInstantiation, diag::note_instantiation_required_here)
<< (PrevTSK != TSK_ImplicitInstantiation);
return true;
}
llvm_unreachable("The switch over PrevTSK must be exhaustive.");
case TSK_ExplicitInstantiationDeclaration:
switch (PrevTSK) {
case TSK_ExplicitInstantiationDeclaration:
// This explicit instantiation declaration is redundant (that's okay).
HasNoEffect = true;
return false;
case TSK_Undeclared:
case TSK_ImplicitInstantiation:
// We're explicitly instantiating something that may have already been
// implicitly instantiated; that's fine.
return false;
case TSK_ExplicitSpecialization:
// C++0x [temp.explicit]p4:
// For a given set of template parameters, if an explicit instantiation
// of a template appears after a declaration of an explicit
// specialization for that template, the explicit instantiation has no
// effect.
HasNoEffect = true;
return false;
case TSK_ExplicitInstantiationDefinition:
// C++0x [temp.explicit]p10:
// If an entity is the subject of both an explicit instantiation
// declaration and an explicit instantiation definition in the same
// translation unit, the definition shall follow the declaration.
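    // For illustration (not from the original source):
    //
    //   template<typename T> struct A {};
    //   template struct A<int>;         // explicit instantiation definition
    //   extern template struct A<int>;  // diagnosed below: the declaration
    //                                    // must precede the definition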
Diag(NewLoc,
diag::err_explicit_instantiation_declaration_after_definition);
// Explicit instantiations following a specialization have no effect and
// hence no PrevPointOfInstantiation. In that case, walk decl backwards
// until a valid name loc is found.
Diag(DiagLocForExplicitInstantiation(PrevDecl, PrevPointOfInstantiation),
diag::note_explicit_instantiation_definition_here);
HasNoEffect = true;
return false;
}
llvm_unreachable("Unexpected TemplateSpecializationKind!");
case TSK_ExplicitInstantiationDefinition:
switch (PrevTSK) {
case TSK_Undeclared:
case TSK_ImplicitInstantiation:
// We're explicitly instantiating something that may have already been
// implicitly instantiated; that's fine.
return false;
case TSK_ExplicitSpecialization:
// C++ DR 259, C++0x [temp.explicit]p4:
// For a given set of template parameters, if an explicit
// instantiation of a template appears after a declaration of
// an explicit specialization for that template, the explicit
// instantiation has no effect.
Diag(NewLoc, diag::warn_explicit_instantiation_after_specialization)
<< PrevDecl;
Diag(PrevDecl->getLocation(),
diag::note_previous_template_specialization);
HasNoEffect = true;
return false;
case TSK_ExplicitInstantiationDeclaration:
// We're explicitly instantiating a definition for something for which we
// were previously asked to suppress instantiations. That's fine.
// C++0x [temp.explicit]p4:
// For a given set of template parameters, if an explicit instantiation
// of a template appears after a declaration of an explicit
// specialization for that template, the explicit instantiation has no
// effect.
for (Decl *Prev = PrevDecl; Prev; Prev = Prev->getPreviousDecl()) {
// Is there any previous explicit specialization declaration?
if (getTemplateSpecializationKind(Prev) == TSK_ExplicitSpecialization) {
HasNoEffect = true;
break;
}
}
return false;
case TSK_ExplicitInstantiationDefinition:
// C++0x [temp.spec]p5:
// For a given template and a given set of template-arguments,
// - an explicit instantiation definition shall appear at most once
// in a program,
// MSVCCompat: MSVC silently ignores duplicate explicit instantiations.
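    // For illustration (not from the original source):
    //
    //   template<typename T> struct A {};
    //   template struct A<int>;
    //   template struct A<int>;  // duplicate: an error, or a warning under
    //                             // -fms-compatibility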
Diag(NewLoc, (getLangOpts().MSVCCompat)
? diag::ext_explicit_instantiation_duplicate
: diag::err_explicit_instantiation_duplicate)
<< PrevDecl;
Diag(DiagLocForExplicitInstantiation(PrevDecl, PrevPointOfInstantiation),
diag::note_previous_explicit_instantiation);
HasNoEffect = true;
return false;
}
}
llvm_unreachable("Missing specialization/instantiation case?");
}
/// Perform semantic analysis for the given dependent function
/// template specialization.
///
/// The only possible way to get a dependent function template specialization
/// is with a friend declaration, like so:
///
/// \code
/// template \<class T> void foo(T);
/// template \<class T> class A {
/// friend void foo<>(T);
/// };
/// \endcode
///
/// There really isn't any useful analysis we can do here, so we
/// just store the information.
bool
Sema::CheckDependentFunctionTemplateSpecialization(FunctionDecl *FD,
const TemplateArgumentListInfo &ExplicitTemplateArgs,
LookupResult &Previous) {
// Remove anything from Previous that isn't a function template in
// the correct context.
DeclContext *FDLookupContext = FD->getDeclContext()->getRedeclContext();
LookupResult::Filter F = Previous.makeFilter();
enum DiscardReason { NotAFunctionTemplate, NotAMemberOfEnclosing };
SmallVector<std::pair<DiscardReason, Decl *>, 8> DiscardedCandidates;
while (F.hasNext()) {
NamedDecl *D = F.next()->getUnderlyingDecl();
if (!isa<FunctionTemplateDecl>(D)) {
F.erase();
DiscardedCandidates.push_back(std::make_pair(NotAFunctionTemplate, D));
continue;
}
if (!FDLookupContext->InEnclosingNamespaceSetOf(
D->getDeclContext()->getRedeclContext())) {
F.erase();
DiscardedCandidates.push_back(std::make_pair(NotAMemberOfEnclosing, D));
continue;
}
}
F.done();
if (Previous.empty()) {
Diag(FD->getLocation(),
diag::err_dependent_function_template_spec_no_match);
for (auto &P : DiscardedCandidates)
Diag(P.second->getLocation(),
diag::note_dependent_function_template_spec_discard_reason)
<< P.first;
return true;
}
FD->setDependentTemplateSpecialization(Context, Previous.asUnresolvedSet(),
ExplicitTemplateArgs);
return false;
}
/// Perform semantic analysis for the given function template
/// specialization.
///
/// This routine performs all of the semantic analysis required for an
/// explicit function template specialization. On successful completion,
/// the function declaration \p FD will become a function template
/// specialization.
///
/// \param FD the function declaration, which will be updated to become a
/// function template specialization.
///
/// \param ExplicitTemplateArgs the explicitly-provided template arguments,
/// if any. Note that this may be valid info even when 0 arguments are
/// explicitly provided as in, e.g., \c void sort<>(char*, char*);
/// since it still records the locations of the angle brackets.
///
/// \param Previous the set of declarations that may be specialized by
/// this function specialization.
///
/// \param QualifiedFriend whether this is a lookup for a qualified friend
/// declaration with no explicit template argument list that might be
/// befriending a function template specialization.
bool Sema::CheckFunctionTemplateSpecialization(
FunctionDecl *FD, TemplateArgumentListInfo *ExplicitTemplateArgs,
LookupResult &Previous, bool QualifiedFriend) {
// The set of function template specializations that could match this
// explicit function template specialization.
UnresolvedSet<8> Candidates;
TemplateSpecCandidateSet FailedCandidates(FD->getLocation(),
/*ForTakingAddress=*/false);
llvm::SmallDenseMap<FunctionDecl *, TemplateArgumentListInfo, 8>
ConvertedTemplateArgs;
DeclContext *FDLookupContext = FD->getDeclContext()->getRedeclContext();
for (LookupResult::iterator I = Previous.begin(), E = Previous.end();
I != E; ++I) {
NamedDecl *Ovl = (*I)->getUnderlyingDecl();
if (FunctionTemplateDecl *FunTmpl = dyn_cast<FunctionTemplateDecl>(Ovl)) {
// Only consider templates found within the same semantic lookup scope as
// FD.
if (!FDLookupContext->InEnclosingNamespaceSetOf(
Ovl->getDeclContext()->getRedeclContext()))
continue;
// When matching a constexpr member function template specialization
// against the primary template, we don't yet know whether the
// specialization has an implicit 'const' (because we don't know whether
// it will be a static member function until we know which template it
// specializes), so adjust it now assuming it specializes this template.
QualType FT = FD->getType();
if (FD->isConstexpr()) {
CXXMethodDecl *OldMD =
dyn_cast<CXXMethodDecl>(FunTmpl->getTemplatedDecl());
if (OldMD && OldMD->isConst()) {
const FunctionProtoType *FPT = FT->castAs<FunctionProtoType>();
FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
EPI.TypeQuals.addConst();
FT = Context.getFunctionType(FPT->getReturnType(),
FPT->getParamTypes(), EPI);
}
}
TemplateArgumentListInfo Args;
if (ExplicitTemplateArgs)
Args = *ExplicitTemplateArgs;
// C++ [temp.expl.spec]p11:
// A trailing template-argument can be left unspecified in the
// template-id naming an explicit function template specialization
// provided it can be deduced from the function argument type.
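      // For illustration (not from the original source):
      //
      //   template<typename T> void sort(T*, T*);
      //   template<> void sort<>(char*, char*);  // 'T' deduced as char
      //   template<> void sort(int*, int*);      // likewise, 'T' deduced as int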
// Perform template argument deduction to determine whether we may be
// specializing this template.
// FIXME: It is somewhat wasteful to build
TemplateDeductionInfo Info(FailedCandidates.getLocation());
FunctionDecl *Specialization = nullptr;
if (TemplateDeductionResult TDK = DeduceTemplateArguments(
cast<FunctionTemplateDecl>(FunTmpl->getFirstDecl()),
ExplicitTemplateArgs ? &Args : nullptr, FT, Specialization,
Info)) {
// Template argument deduction failed; record why it failed, so
// that we can provide nifty diagnostics.
FailedCandidates.addCandidate().set(
I.getPair(), FunTmpl->getTemplatedDecl(),
MakeDeductionFailureInfo(Context, TDK, Info));
(void)TDK;
continue;
}
// Target attributes are part of the cuda function signature, so
// the deduced template's cuda target must match that of the
// specialization. Given that C++ template deduction does not
// take target attributes into account, we reject candidates
// here that have a different target.
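      // For illustration (not from the original source, assuming the usual
      // CUDA __host__/__device__ attribute macros):
      //
      //   template<typename T> __device__ void f(T) {}
      //   template<> __host__ void f(int) {}  // not considered a match for
      //                                        // the __device__ template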
if (LangOpts.CUDA &&
IdentifyCUDATarget(Specialization,
/* IgnoreImplicitHDAttr = */ true) !=
IdentifyCUDATarget(FD, /* IgnoreImplicitHDAttr = */ true)) {
FailedCandidates.addCandidate().set(
I.getPair(), FunTmpl->getTemplatedDecl(),
MakeDeductionFailureInfo(Context, TDK_CUDATargetMismatch, Info));
continue;
}
// Record this candidate.
if (ExplicitTemplateArgs)
ConvertedTemplateArgs[Specialization] = std::move(Args);
Candidates.addDecl(Specialization, I.getAccess());
}
}
// For a qualified friend declaration (with no explicit marker to indicate
// that a template specialization was intended), note all (template and
// non-template) candidates.
if (QualifiedFriend && Candidates.empty()) {
Diag(FD->getLocation(), diag::err_qualified_friend_no_match)
<< FD->getDeclName() << FDLookupContext;
// FIXME: We should form a single candidate list and diagnose all
// candidates at once, to get proper sorting and limiting.
for (auto *OldND : Previous) {
if (auto *OldFD = dyn_cast<FunctionDecl>(OldND->getUnderlyingDecl()))
NoteOverloadCandidate(OldND, OldFD, FD->getType(), false);
}
FailedCandidates.NoteCandidates(*this, FD->getLocation());
return true;
}
// Find the most specialized function template.
UnresolvedSetIterator Result = getMostSpecialized(
Candidates.begin(), Candidates.end(), FailedCandidates, FD->getLocation(),
PDiag(diag::err_function_template_spec_no_match) << FD->getDeclName(),
PDiag(diag::err_function_template_spec_ambiguous)
<< FD->getDeclName() << (ExplicitTemplateArgs != nullptr),
PDiag(diag::note_function_template_spec_matched));
if (Result == Candidates.end())
return true;
// Ignore access information; it doesn't figure into redeclaration checking.
FunctionDecl *Specialization = cast<FunctionDecl>(*Result);
FunctionTemplateSpecializationInfo *SpecInfo
= Specialization->getTemplateSpecializationInfo();
assert(SpecInfo && "Function template specialization info missing?");
// Note: do not overwrite location info if previous template
// specialization kind was explicit.
TemplateSpecializationKind TSK = SpecInfo->getTemplateSpecializationKind();
if (TSK == TSK_Undeclared || TSK == TSK_ImplicitInstantiation) {
Specialization->setLocation(FD->getLocation());
Specialization->setLexicalDeclContext(FD->getLexicalDeclContext());
// C++11 [dcl.constexpr]p1: An explicit specialization of a constexpr
// function can differ from the template declaration with respect to
// the constexpr specifier.
// FIXME: We need an update record for this AST mutation.
// FIXME: What if there are multiple such prior declarations (for instance,
// from different modules)?
Specialization->setConstexprKind(FD->getConstexprKind());
}
// FIXME: Check if the prior specialization has a point of instantiation.
// If so, we have run afoul of .
// If this is a friend declaration, then we're not really declaring
// an explicit specialization.
bool isFriend = (FD->getFriendObjectKind() != Decl::FOK_None);
// Check the scope of this explicit specialization.
if (!isFriend &&
CheckTemplateSpecializationScope(*this,
Specialization->getPrimaryTemplate(),
Specialization, FD->getLocation(),
false))
return true;
// C++ [temp.expl.spec]p6:
// If a template, a member template or the member of a class template is
// explicitly specialized then that specialization shall be declared
// before the first use of that specialization that would cause an implicit
// instantiation to take place, in every translation unit in which such a
// use occurs; no diagnostic is required.
bool HasNoEffect = false;
if (!isFriend &&
CheckSpecializationInstantiationRedecl(FD->getLocation(),
TSK_ExplicitSpecialization,
Specialization,
SpecInfo->getTemplateSpecializationKind(),
SpecInfo->getPointOfInstantiation(),
HasNoEffect))
return true;
// Mark the prior declaration as an explicit specialization, so that later
// clients know that this is an explicit specialization.
if (!isFriend) {
    // Explicit specializations do not inherit '=delete' from their primary
    // function template. If the 'Specialization' that was implicitly
    // generated (during template argument deduction for partial ordering)
    // from the most specialized of all the function templates that 'FD'
    // could have been specializing has a 'deleted' definition, first check
    // that it really was implicitly generated (by making sure it was never
    // referenced), and then reset the deleted flag to not-deleted, so that
    // we can inherit that information from 'FD'.
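    // For illustration (not from the original source):
    //
    //   template<typename T> void f(T) = delete;
    //   template<> void f(int) {}  // OK: the explicit specialization does
    //                               // not inherit '=delete'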
if (Specialization->isDeleted() && !SpecInfo->isExplicitSpecialization() &&
!Specialization->getCanonicalDecl()->isReferenced()) {
// FIXME: This assert will not hold in the presence of modules.
assert(
Specialization->getCanonicalDecl() == Specialization &&
"This must be the only existing declaration of this specialization");
// FIXME: We need an update record for this AST mutation.
Specialization->setDeletedAsWritten(false);
}
// FIXME: We need an update record for this AST mutation.
SpecInfo->setTemplateSpecializationKind(TSK_ExplicitSpecialization);
MarkUnusedFileScopedDecl(Specialization);
}
// Turn the given function declaration into a function template
// specialization, with the template arguments from the previous
// specialization.
// Take copies of (semantic and syntactic) template argument lists.
const TemplateArgumentList* TemplArgs = new (Context)
TemplateArgumentList(Specialization->getTemplateSpecializationArgs());
FD->setFunctionTemplateSpecialization(
Specialization->getPrimaryTemplate(), TemplArgs, /*InsertPos=*/nullptr,
SpecInfo->getTemplateSpecializationKind(),
ExplicitTemplateArgs ? &ConvertedTemplateArgs[Specialization] : nullptr);
// A function template specialization inherits the target attributes
// of its template. (We require the attributes explicitly in the
// code to match, but a template may have implicit attributes by
// virtue e.g. of being constexpr, and it passes these implicit
// attributes on to its specializations.)
if (LangOpts.CUDA)
inheritCUDATargetAttrs(FD, *Specialization->getPrimaryTemplate());
// The "previous declaration" for this function template specialization is
// the prior function template specialization.
Previous.clear();
Previous.addDecl(Specialization);
return false;
}
/// Perform semantic analysis for the given non-template member
/// specialization.
///
/// This routine performs all of the semantic analysis required for an
/// explicit member function specialization. On successful completion,
/// the function declaration \p FD will become a member function
/// specialization.
///
/// \param Member the member declaration, which will be updated to become a
/// specialization.
///
/// \param Previous the set of declarations, one of which may be specialized
/// by this function specialization; the set will be modified to contain the
/// redeclared member.
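///
/// For illustration (not part of the original documentation), a typical case
/// handled here:
///
/// \code
/// template<typename T> struct A { void f(); static int x; };
/// template<> void A<int>::f() {}  // member function specialization
/// template<> int A<int>::x = 0;   // static data member specialization
/// \endcode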
bool
Sema::CheckMemberSpecialization(NamedDecl *Member, LookupResult &Previous) {
assert(!isa<TemplateDecl>(Member) && "Only for non-template members");
// Try to find the member we are instantiating.
NamedDecl *FoundInstantiation = nullptr;
NamedDecl *Instantiation = nullptr;
NamedDecl *InstantiatedFrom = nullptr;
MemberSpecializationInfo *MSInfo = nullptr;
if (Previous.empty()) {
// Nowhere to look anyway.
} else if (FunctionDecl *Function = dyn_cast<FunctionDecl>(Member)) {
for (LookupResult::iterator I = Previous.begin(), E = Previous.end();
I != E; ++I) {
NamedDecl *D = (*I)->getUnderlyingDecl();
if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D)) {
QualType Adjusted = Function->getType();
if (!hasExplicitCallingConv(Adjusted))
Adjusted = adjustCCAndNoReturn(Adjusted, Method->getType());
// This doesn't handle deduced return types, but both function
// declarations should be undeduced at this point.
if (Context.hasSameType(Adjusted, Method->getType())) {
FoundInstantiation = *I;
Instantiation = Method;
InstantiatedFrom = Method->getInstantiatedFromMemberFunction();
MSInfo = Method->getMemberSpecializationInfo();
break;
}
}
}
} else if (isa<VarDecl>(Member)) {
VarDecl *PrevVar;
if (Previous.isSingleResult() &&
(PrevVar = dyn_cast<VarDecl>(Previous.getFoundDecl())))
if (PrevVar->isStaticDataMember()) {
FoundInstantiation = Previous.getRepresentativeDecl();
Instantiation = PrevVar;
InstantiatedFrom = PrevVar->getInstantiatedFromStaticDataMember();
MSInfo = PrevVar->getMemberSpecializationInfo();
}
} else if (isa<RecordDecl>(Member)) {
CXXRecordDecl *PrevRecord;
if (Previous.isSingleResult() &&
(PrevRecord = dyn_cast<CXXRecordDecl>(Previous.getFoundDecl()))) {
FoundInstantiation = Previous.getRepresentativeDecl();
Instantiation = PrevRecord;
InstantiatedFrom = PrevRecord->getInstantiatedFromMemberClass();
MSInfo = PrevRecord->getMemberSpecializationInfo();
}
} else if (isa<EnumDecl>(Member)) {
EnumDecl *PrevEnum;
if (Previous.isSingleResult() &&
(PrevEnum = dyn_cast<EnumDecl>(Previous.getFoundDecl()))) {
FoundInstantiation = Previous.getRepresentativeDecl();
Instantiation = PrevEnum;
InstantiatedFrom = PrevEnum->getInstantiatedFromMemberEnum();
MSInfo = PrevEnum->getMemberSpecializationInfo();
}
}
if (!Instantiation) {
// There is no previous declaration that matches. Since member
// specializations are always out-of-line, the caller will complain about
// this mismatch later.
return false;
}
// A member specialization in a friend declaration isn't really declaring
// an explicit specialization, just identifying a specific (possibly implicit)
// specialization. Don't change the template specialization kind.
//
// FIXME: Is this really valid? Other compilers reject.
if (Member->getFriendObjectKind() != Decl::FOK_None) {
// Preserve instantiation information.
if (InstantiatedFrom && isa<CXXMethodDecl>(Member)) {
cast<CXXMethodDecl>(Member)->setInstantiationOfMemberFunction(
cast<CXXMethodDecl>(InstantiatedFrom),
cast<CXXMethodDecl>(Instantiation)->getTemplateSpecializationKind());
} else if (InstantiatedFrom && isa<CXXRecordDecl>(Member)) {
cast<CXXRecordDecl>(Member)->setInstantiationOfMemberClass(
cast<CXXRecordDecl>(InstantiatedFrom),
cast<CXXRecordDecl>(Instantiation)->getTemplateSpecializationKind());
}
Previous.clear();
Previous.addDecl(FoundInstantiation);
return false;
}
// Make sure that this is a specialization of a member.
if (!InstantiatedFrom) {
Diag(Member->getLocation(), diag::err_spec_member_not_instantiated)
<< Member;
Diag(Instantiation->getLocation(), diag::note_specialized_decl);
return true;
}
// C++ [temp.expl.spec]p6:
// If a template, a member template or the member of a class template is
// explicitly specialized then that specialization shall be declared
// before the first use of that specialization that would cause an implicit
// instantiation to take place, in every translation unit in which such a
// use occurs; no diagnostic is required.
assert(MSInfo && "Member specialization info missing?");
bool HasNoEffect = false;
if (CheckSpecializationInstantiationRedecl(Member->getLocation(),
TSK_ExplicitSpecialization,
Instantiation,
MSInfo->getTemplateSpecializationKind(),
MSInfo->getPointOfInstantiation(),
HasNoEffect))
return true;
// Check the scope of this explicit specialization.
if (CheckTemplateSpecializationScope(*this,
InstantiatedFrom,
Instantiation, Member->getLocation(),
false))
return true;
// Note that this member specialization is an "instantiation of" the
// corresponding member of the original template.
if (auto *MemberFunction = dyn_cast<FunctionDecl>(Member)) {
FunctionDecl *InstantiationFunction = cast<FunctionDecl>(Instantiation);
if (InstantiationFunction->getTemplateSpecializationKind() ==
TSK_ImplicitInstantiation) {
// Explicit specializations of member functions of class templates do not
// inherit '=delete' from the member function they are specializing.
if (InstantiationFunction->isDeleted()) {
// FIXME: This assert will not hold in the presence of modules.
assert(InstantiationFunction->getCanonicalDecl() ==
InstantiationFunction);
// FIXME: We need an update record for this AST mutation.
InstantiationFunction->setDeletedAsWritten(false);
}
}
MemberFunction->setInstantiationOfMemberFunction(
cast<CXXMethodDecl>(InstantiatedFrom), TSK_ExplicitSpecialization);
} else if (auto *MemberVar = dyn_cast<VarDecl>(Member)) {
MemberVar->setInstantiationOfStaticDataMember(
cast<VarDecl>(InstantiatedFrom), TSK_ExplicitSpecialization);
} else if (auto *MemberClass = dyn_cast<CXXRecordDecl>(Member)) {
MemberClass->setInstantiationOfMemberClass(
cast<CXXRecordDecl>(InstantiatedFrom), TSK_ExplicitSpecialization);
} else if (auto *MemberEnum = dyn_cast<EnumDecl>(Member)) {
MemberEnum->setInstantiationOfMemberEnum(
cast<EnumDecl>(InstantiatedFrom), TSK_ExplicitSpecialization);
} else {
llvm_unreachable("unknown member specialization kind");
}
// Save the caller the trouble of having to figure out which declaration
// this specialization matches.
Previous.clear();
Previous.addDecl(FoundInstantiation);
return false;
}
/// Complete the explicit specialization of a member of a class template by
/// updating the instantiated member to be marked as an explicit specialization.
///
/// \param OrigD The member declaration instantiated from the template.
/// \param Loc The location of the explicit specialization of the member.
template<typename DeclT>
static void completeMemberSpecializationImpl(Sema &S, DeclT *OrigD,
SourceLocation Loc) {
if (OrigD->getTemplateSpecializationKind() != TSK_ImplicitInstantiation)
return;
// FIXME: Inform AST mutation listeners of this AST mutation.
// FIXME: If there are multiple in-class declarations of the member (from
// multiple modules, or a declaration and later definition of a member type),
// should we update all of them?
OrigD->setTemplateSpecializationKind(TSK_ExplicitSpecialization);
OrigD->setLocation(Loc);
}
void Sema::CompleteMemberSpecialization(NamedDecl *Member,
LookupResult &Previous) {
NamedDecl *Instantiation = cast<NamedDecl>(Member->getCanonicalDecl());
if (Instantiation == Member)
return;
if (auto *Function = dyn_cast<CXXMethodDecl>(Instantiation))
completeMemberSpecializationImpl(*this, Function, Member->getLocation());
else if (auto *Var = dyn_cast<VarDecl>(Instantiation))
completeMemberSpecializationImpl(*this, Var, Member->getLocation());
else if (auto *Record = dyn_cast<CXXRecordDecl>(Instantiation))
completeMemberSpecializationImpl(*this, Record, Member->getLocation());
else if (auto *Enum = dyn_cast<EnumDecl>(Instantiation))
completeMemberSpecializationImpl(*this, Enum, Member->getLocation());
else
llvm_unreachable("unknown member specialization kind");
}
/// Check the scope of an explicit instantiation.
///
/// \returns true if a serious error occurs, false otherwise.
static bool CheckExplicitInstantiationScope(Sema &S, NamedDecl *D,
SourceLocation InstLoc,
bool WasQualifiedName) {
DeclContext *OrigContext= D->getDeclContext()->getEnclosingNamespaceContext();
DeclContext *CurContext = S.CurContext->getRedeclContext();
if (CurContext->isRecord()) {
S.Diag(InstLoc, diag::err_explicit_instantiation_in_class)
<< D;
return true;
}
// C++11 [temp.explicit]p3:
// An explicit instantiation shall appear in an enclosing namespace of its
// template. If the name declared in the explicit instantiation is an
// unqualified name, the explicit instantiation shall appear in the
// namespace where its template is declared or, if that namespace is inline
// (7.3.1), any namespace from its enclosing namespace set.
//
// This is DR275, which we do not retroactively apply to C++98/03.
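  // For illustration (not from the original source):
  //
  //   namespace N { template<typename T> struct X {}; }
  //   template struct N::X<int>;                    // OK: the global
  //                                                  // namespace encloses N
  //   namespace M { template struct N::X<float>; }  // diagnosed below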
if (WasQualifiedName) {
if (CurContext->Encloses(OrigContext))
return false;
} else {
if (CurContext->InEnclosingNamespaceSetOf(OrigContext))
return false;
}
if (NamespaceDecl *NS = dyn_cast<NamespaceDecl>(OrigContext)) {
if (WasQualifiedName)
S.Diag(InstLoc,
S.getLangOpts().CPlusPlus11?
diag::err_explicit_instantiation_out_of_scope :
diag::warn_explicit_instantiation_out_of_scope_0x)
<< D << NS;
else
S.Diag(InstLoc,
S.getLangOpts().CPlusPlus11?
diag::err_explicit_instantiation_unqualified_wrong_namespace :
diag::warn_explicit_instantiation_unqualified_wrong_namespace_0x)
<< D << NS;
} else
S.Diag(InstLoc,
S.getLangOpts().CPlusPlus11?
diag::err_explicit_instantiation_must_be_global :
diag::warn_explicit_instantiation_must_be_global_0x)
<< D;
S.Diag(D->getLocation(), diag::note_explicit_instantiation_here);
return false;
}
/// Common checks for whether an explicit instantiation of \p D is valid.
static bool CheckExplicitInstantiation(Sema &S, NamedDecl *D,
SourceLocation InstLoc,
bool WasQualifiedName,
TemplateSpecializationKind TSK) {
// C++ [temp.explicit]p13:
// An explicit instantiation declaration shall not name a specialization of
// a template with internal linkage.
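  // For illustration (not from the original source):
  //
  //   namespace { template<typename T> struct A {}; }
  //   extern template struct A<int>;  // diagnosed: 'A' has internal linkage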
if (TSK == TSK_ExplicitInstantiationDeclaration &&
D->getFormalLinkage() == InternalLinkage) {
S.Diag(InstLoc, diag::err_explicit_instantiation_internal_linkage) << D;
return true;
}
// C++11 [temp.explicit]p3: [DR 275]
// An explicit instantiation shall appear in an enclosing namespace of its
// template.
if (CheckExplicitInstantiationScope(S, D, InstLoc, WasQualifiedName))
return true;
return false;
}
/// Determine whether the given scope specifier has a template-id in it.
static bool ScopeSpecifierHasTemplateId(const CXXScopeSpec &SS) {
if (!SS.isSet())
return false;
// C++11 [temp.explicit]p3:
// If the explicit instantiation is for a member function, a member class
// or a static data member of a class template specialization, the name of
// the class template specialization in the qualified-id for the member
// name shall be a simple-template-id.
//
// C++98 has the same restriction, just worded differently.
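  // For illustration (not from the original source):
  //
  //   template<typename T> struct A { struct B {}; };
  //   typedef A<int> AI;
  //   template struct A<int>::B;  // qualifier contains a simple-template-id
  //   template struct AI::B;      // it does not; the caller notes this as an
  //                               // extension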
for (NestedNameSpecifier *NNS = SS.getScopeRep(); NNS;
NNS = NNS->getPrefix())
if (const Type *T = NNS->getAsType())
if (isa<TemplateSpecializationType>(T))
return true;
return false;
}
/// Make a dllexport or dllimport attr on a class template specialization take
/// effect.
static void dllExportImportClassTemplateSpecialization(
Sema &S, ClassTemplateSpecializationDecl *Def) {
auto *A = cast_or_null<InheritableAttr>(getDLLAttr(Def));
assert(A && "dllExportImportClassTemplateSpecialization called "
"on Def without dllexport or dllimport");
// We reject explicit instantiations in class scope, so there should
// never be any delayed exported classes to worry about.
assert(S.DelayedDllExportClasses.empty() &&
"delayed exports present at explicit instantiation");
S.checkClassLevelDLLAttribute(Def);
// Propagate attribute to base class templates.
for (auto &B : Def->bases()) {
if (auto *BT = dyn_cast_or_null<ClassTemplateSpecializationDecl>(
B.getType()->getAsCXXRecordDecl()))
S.propagateDLLAttrToBaseClassTemplate(Def, A, BT, B.getBeginLoc());
}
S.referenceDLLExportedClassMethods();
}
// Explicit instantiation of a class template specialization
DeclResult Sema::ActOnExplicitInstantiation(
Scope *S, SourceLocation ExternLoc, SourceLocation TemplateLoc,
unsigned TagSpec, SourceLocation KWLoc, const CXXScopeSpec &SS,
TemplateTy TemplateD, SourceLocation TemplateNameLoc,
SourceLocation LAngleLoc, ASTTemplateArgsPtr TemplateArgsIn,
SourceLocation RAngleLoc, const ParsedAttributesView &Attr) {
// Find the class template we're specializing
TemplateName Name = TemplateD.get();
TemplateDecl *TD = Name.getAsTemplateDecl();
// Check that the specialization uses the same tag kind as the
// original template.
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
assert(Kind != TTK_Enum &&
"Invalid enum tag in class template explicit instantiation!");
ClassTemplateDecl *ClassTemplate = dyn_cast<ClassTemplateDecl>(TD);
if (!ClassTemplate) {
NonTagKind NTK = getNonTagTypeDeclKind(TD, Kind);
Diag(TemplateNameLoc, diag::err_tag_reference_non_tag) << TD << NTK << Kind;
Diag(TD->getLocation(), diag::note_previous_use);
return true;
}
if (!isAcceptableTagRedeclaration(ClassTemplate->getTemplatedDecl(),
Kind, /*isDefinition*/false, KWLoc,
ClassTemplate->getIdentifier())) {
Diag(KWLoc, diag::err_use_with_wrong_tag)
<< ClassTemplate
<< FixItHint::CreateReplacement(KWLoc,
ClassTemplate->getTemplatedDecl()->getKindName());
Diag(ClassTemplate->getTemplatedDecl()->getLocation(),
diag::note_previous_use);
Kind = ClassTemplate->getTemplatedDecl()->getTagKind();
}
// C++0x [temp.explicit]p2:
// There are two forms of explicit instantiation: an explicit instantiation
// definition and an explicit instantiation declaration. An explicit
// instantiation declaration begins with the extern keyword. [...]
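  // For illustration (not from the original source), given some class
  // template A:
  //
  //   template struct A<int>;         // explicit instantiation definition
  //   extern template struct A<int>;  // explicit instantiation declaration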
TemplateSpecializationKind TSK = ExternLoc.isInvalid()
? TSK_ExplicitInstantiationDefinition
: TSK_ExplicitInstantiationDeclaration;
if (TSK == TSK_ExplicitInstantiationDeclaration &&
!Context.getTargetInfo().getTriple().isWindowsGNUEnvironment()) {
// Check for dllexport class template instantiation declarations,
// except for MinGW mode.
for (const ParsedAttr &AL : Attr) {
if (AL.getKind() == ParsedAttr::AT_DLLExport) {
Diag(ExternLoc,
diag::warn_attribute_dllexport_explicit_instantiation_decl);
Diag(AL.getLoc(), diag::note_attribute);
break;
}
}
if (auto *A = ClassTemplate->getTemplatedDecl()->getAttr<DLLExportAttr>()) {
Diag(ExternLoc,
diag::warn_attribute_dllexport_explicit_instantiation_decl);
Diag(A->getLocation(), diag::note_attribute);
}
}
// In MSVC mode, dllimported explicit instantiation definitions are treated as
// instantiation declarations for most purposes.
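  // For illustration (not from the original source), under the Microsoft ABI:
  //
  //   template struct __declspec(dllimport) A<int>;  // handled mostly like
  //                                                   // 'extern template'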
bool DLLImportExplicitInstantiationDef = false;
if (TSK == TSK_ExplicitInstantiationDefinition &&
Context.getTargetInfo().getCXXABI().isMicrosoft()) {
// Check for dllimport class template instantiation definitions.
bool DLLImport =
ClassTemplate->getTemplatedDecl()->getAttr<DLLImportAttr>();
for (const ParsedAttr &AL : Attr) {
if (AL.getKind() == ParsedAttr::AT_DLLImport)
DLLImport = true;
if (AL.getKind() == ParsedAttr::AT_DLLExport) {
// dllexport trumps dllimport here.
DLLImport = false;
break;
}
}
if (DLLImport) {
TSK = TSK_ExplicitInstantiationDeclaration;
DLLImportExplicitInstantiationDef = true;
}
}
// Translate the parser's template argument list in our AST format.
TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc);
translateTemplateArguments(TemplateArgsIn, TemplateArgs);
// Check that the template argument list is well-formed for this
// template.
SmallVector<TemplateArgument, 4> Converted;
if (CheckTemplateArgumentList(ClassTemplate, TemplateNameLoc,
TemplateArgs, false, Converted))
return true;
// Find the class template specialization declaration that
// corresponds to these arguments.
void *InsertPos = nullptr;
ClassTemplateSpecializationDecl *PrevDecl
= ClassTemplate->findSpecialization(Converted, InsertPos);
TemplateSpecializationKind PrevDecl_TSK
= PrevDecl ? PrevDecl->getTemplateSpecializationKind() : TSK_Undeclared;
if (TSK == TSK_ExplicitInstantiationDefinition && PrevDecl != nullptr &&
Context.getTargetInfo().getTriple().isWindowsGNUEnvironment()) {
// Check for dllexport class template instantiation definitions in MinGW
// mode, if a previous declaration of the instantiation was seen.
for (const ParsedAttr &AL : Attr) {
if (AL.getKind() == ParsedAttr::AT_DLLExport) {
Diag(AL.getLoc(),
diag::warn_attribute_dllexport_explicit_instantiation_def);
break;
}
}
}
if (CheckExplicitInstantiation(*this, ClassTemplate, TemplateNameLoc,
SS.isSet(), TSK))
return true;
ClassTemplateSpecializationDecl *Specialization = nullptr;
bool HasNoEffect = false;
if (PrevDecl) {
if (CheckSpecializationInstantiationRedecl(TemplateNameLoc, TSK,
PrevDecl, PrevDecl_TSK,
PrevDecl->getPointOfInstantiation(),
HasNoEffect))
return PrevDecl;
// Even though HasNoEffect == true means that this explicit instantiation
// has no effect on semantics, we go on to put its syntax in the AST.
if (PrevDecl_TSK == TSK_ImplicitInstantiation ||
PrevDecl_TSK == TSK_Undeclared) {
// Since the only prior class template specialization with these
// arguments was referenced but not declared, reuse that
// declaration node as our own, updating the source location
// for the template name to reflect our new declaration.
// (Other source locations will be updated later.)
Specialization = PrevDecl;
Specialization->setLocation(TemplateNameLoc);
PrevDecl = nullptr;
}
if (PrevDecl_TSK == TSK_ExplicitInstantiationDeclaration &&
DLLImportExplicitInstantiationDef) {
// The new specialization might add a dllimport attribute.
HasNoEffect = false;
}
}
if (!Specialization) {
// Create a new class template specialization declaration node for
// this explicit specialization.
Specialization
= ClassTemplateSpecializationDecl::Create(Context, Kind,
ClassTemplate->getDeclContext(),
KWLoc, TemplateNameLoc,
ClassTemplate,
Converted,
PrevDecl);
SetNestedNameSpecifier(*this, Specialization, SS);
if (!HasNoEffect && !PrevDecl) {
// Insert the new specialization.
ClassTemplate->AddSpecialization(Specialization, InsertPos);
}
}
// Build the fully-sugared type for this explicit instantiation as
// the user wrote in the explicit instantiation itself. This means
// that we'll pretty-print the type retrieved from the
// specialization's declaration the way that the user actually wrote
// the explicit instantiation, rather than formatting the name based
// on the "canonical" representation used to store the template
// arguments in the specialization.
TypeSourceInfo *WrittenTy
= Context.getTemplateSpecializationTypeInfo(Name, TemplateNameLoc,
TemplateArgs,
Context.getTypeDeclType(Specialization));
Specialization->setTypeAsWritten(WrittenTy);
// Set source locations for keywords.
Specialization->setExternLoc(ExternLoc);
Specialization->setTemplateKeywordLoc(TemplateLoc);
Specialization->setBraceRange(SourceRange());
bool PreviouslyDLLExported = Specialization->hasAttr<DLLExportAttr>();
ProcessDeclAttributeList(S, Specialization, Attr);
// Add the explicit instantiation into its lexical context. However,
// since explicit instantiations are never found by name lookup, we
// just put it into the declaration context directly.
Specialization->setLexicalDeclContext(CurContext);
CurContext->addDecl(Specialization);
// Syntax is now OK, so return if it has no other effect on semantics.
if (HasNoEffect) {
// Set the template specialization kind.
Specialization->setTemplateSpecializationKind(TSK);
return Specialization;
}
// C++ [temp.explicit]p3:
// A definition of a class template or class member template
// shall be in scope at the point of the explicit instantiation of
// the class template or class member template.
//
// This check comes when we actually try to perform the
// instantiation.
ClassTemplateSpecializationDecl *Def
= cast_or_null<ClassTemplateSpecializationDecl>(
Specialization->getDefinition());
if (!Def)
InstantiateClassTemplateSpecialization(TemplateNameLoc, Specialization, TSK);
else if (TSK == TSK_ExplicitInstantiationDefinition) {
MarkVTableUsed(TemplateNameLoc, Specialization, true);
Specialization->setPointOfInstantiation(Def->getPointOfInstantiation());
}
// Instantiate the members of this class template specialization.
Def = cast_or_null<ClassTemplateSpecializationDecl>(
Specialization->getDefinition());
if (Def) {
TemplateSpecializationKind Old_TSK = Def->getTemplateSpecializationKind();
// Fix a TSK_ExplicitInstantiationDeclaration followed by a
// TSK_ExplicitInstantiationDefinition
if (Old_TSK == TSK_ExplicitInstantiationDeclaration &&
(TSK == TSK_ExplicitInstantiationDefinition ||
DLLImportExplicitInstantiationDef)) {
// FIXME: Need to notify the ASTMutationListener that we did this.
Def->setTemplateSpecializationKind(TSK);
if (!getDLLAttr(Def) && getDLLAttr(Specialization) &&
(Context.getTargetInfo().getCXXABI().isMicrosoft() ||
Context.getTargetInfo().getTriple().isWindowsItaniumEnvironment())) {
// In the MS ABI, an explicit instantiation definition can add a dll
// attribute to a template with a previous instantiation declaration.
// MinGW doesn't allow this.
auto *A = cast<InheritableAttr>(
getDLLAttr(Specialization)->clone(getASTContext()));
A->setInherited(true);
Def->addAttr(A);
dllExportImportClassTemplateSpecialization(*this, Def);
}
}
// Fix a TSK_ImplicitInstantiation followed by a
// TSK_ExplicitInstantiationDefinition
bool NewlyDLLExported =
!PreviouslyDLLExported && Specialization->hasAttr<DLLExportAttr>();
if (Old_TSK == TSK_ImplicitInstantiation && NewlyDLLExported &&
(Context.getTargetInfo().getCXXABI().isMicrosoft() ||
Context.getTargetInfo().getTriple().isWindowsItaniumEnvironment())) {
// In the MS ABI, an explicit instantiation definition can add a dll
// attribute to a template with a previous implicit instantiation.
// MinGW doesn't allow this. We limit clang to only adding dllexport, to
// avoid potentially strange codegen behavior. For example, if we extend
// this conditional to dllimport, and we have a source file calling a
// method on an implicitly instantiated template class instance and then
// declaring a dllimport explicit instantiation definition for the same
// template class, the codegen for the method call will not respect the
// dllimport, while it will with cl. The Def will already have the DLL
// attribute, since the Def and Specialization will be the same in the
// case of Old_TSK == TSK_ImplicitInstantiation, and we already added the
// attribute to the Specialization; we just need to make it take effect.
assert(Def == Specialization &&
"Def and Specialization should match for implicit instantiation");
dllExportImportClassTemplateSpecialization(*this, Def);
}
// In MinGW mode, export the template instantiation if the declaration
// was marked dllexport.
if (PrevDecl_TSK == TSK_ExplicitInstantiationDeclaration &&
Context.getTargetInfo().getTriple().isWindowsGNUEnvironment() &&
PrevDecl->hasAttr<DLLExportAttr>()) {
dllExportImportClassTemplateSpecialization(*this, Def);
}
// Set the template specialization kind. Make sure it is set before
// instantiating the members which will trigger ASTConsumer callbacks.
Specialization->setTemplateSpecializationKind(TSK);
InstantiateClassTemplateSpecializationMembers(TemplateNameLoc, Def, TSK);
} else {
// Set the template specialization kind.
Specialization->setTemplateSpecializationKind(TSK);
}
return Specialization;
}
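// Illustration (hypothetical user code, not part of this file): the two forms
// of explicit instantiation that the routine above distinguishes via ExternLoc.
//
//   template<typename T> struct Pair { T first, second; };
//   extern template struct Pair<int>; // TSK_ExplicitInstantiationDeclaration
//   template struct Pair<int>;        // TSK_ExplicitInstantiationDefinition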
// Explicit instantiation of a member class of a class template.
DeclResult
Sema::ActOnExplicitInstantiation(Scope *S, SourceLocation ExternLoc,
SourceLocation TemplateLoc, unsigned TagSpec,
SourceLocation KWLoc, CXXScopeSpec &SS,
IdentifierInfo *Name, SourceLocation NameLoc,
const ParsedAttributesView &Attr) {
bool Owned = false;
bool IsDependent = false;
Decl *TagD = ActOnTag(S, TagSpec, Sema::TUK_Reference,
KWLoc, SS, Name, NameLoc, Attr, AS_none,
/*ModulePrivateLoc=*/SourceLocation(),
MultiTemplateParamsArg(), Owned, IsDependent,
SourceLocation(), false, TypeResult(),
/*IsTypeSpecifier*/false,
/*IsTemplateParamOrArg*/false);
assert(!IsDependent && "explicit instantiation of dependent name not yet handled");
if (!TagD)
return true;
TagDecl *Tag = cast<TagDecl>(TagD);
assert(!Tag->isEnum() && "shouldn't see enumerations here");
if (Tag->isInvalidDecl())
return true;
CXXRecordDecl *Record = cast<CXXRecordDecl>(Tag);
CXXRecordDecl *Pattern = Record->getInstantiatedFromMemberClass();
if (!Pattern) {
Diag(TemplateLoc, diag::err_explicit_instantiation_nontemplate_type)
<< Context.getTypeDeclType(Record);
Diag(Record->getLocation(), diag::note_nontemplate_decl_here);
return true;
}
// C++0x [temp.explicit]p2:
// If the explicit instantiation is for a class or member class, the
// elaborated-type-specifier in the declaration shall include a
// simple-template-id.
//
// C++98 has the same restriction, just worded differently.
if (!ScopeSpecifierHasTemplateId(SS))
Diag(TemplateLoc, diag::ext_explicit_instantiation_without_qualified_id)
<< Record << SS.getRange();
// C++0x [temp.explicit]p2:
// There are two forms of explicit instantiation: an explicit instantiation
// definition and an explicit instantiation declaration. An explicit
// instantiation declaration begins with the extern keyword. [...]
TemplateSpecializationKind TSK
= ExternLoc.isInvalid()? TSK_ExplicitInstantiationDefinition
: TSK_ExplicitInstantiationDeclaration;
CheckExplicitInstantiation(*this, Record, NameLoc, true, TSK);
// Verify that it is okay to explicitly instantiate here.
CXXRecordDecl *PrevDecl
= cast_or_null<CXXRecordDecl>(Record->getPreviousDecl());
if (!PrevDecl && Record->getDefinition())
PrevDecl = Record;
if (PrevDecl) {
MemberSpecializationInfo *MSInfo = PrevDecl->getMemberSpecializationInfo();
bool HasNoEffect = false;
assert(MSInfo && "No member specialization information?");
if (CheckSpecializationInstantiationRedecl(TemplateLoc, TSK,
PrevDecl,
MSInfo->getTemplateSpecializationKind(),
MSInfo->getPointOfInstantiation(),
HasNoEffect))
return true;
if (HasNoEffect)
return TagD;
}
CXXRecordDecl *RecordDef
= cast_or_null<CXXRecordDecl>(Record->getDefinition());
if (!RecordDef) {
// C++ [temp.explicit]p3:
// A definition of a member class of a class template shall be in scope
// at the point of an explicit instantiation of the member class.
CXXRecordDecl *Def
= cast_or_null<CXXRecordDecl>(Pattern->getDefinition());
if (!Def) {
Diag(TemplateLoc, diag::err_explicit_instantiation_undefined_member)
<< 0 << Record->getDeclName() << Record->getDeclContext();
Diag(Pattern->getLocation(), diag::note_forward_declaration)
<< Pattern;
return true;
} else {
if (InstantiateClass(NameLoc, Record, Def,
getTemplateInstantiationArgs(Record),
TSK))
return true;
RecordDef = cast_or_null<CXXRecordDecl>(Record->getDefinition());
if (!RecordDef)
return true;
}
}
// Instantiate all of the members of the class.
InstantiateClassMembers(NameLoc, RecordDef,
getTemplateInstantiationArgs(Record), TSK);
if (TSK == TSK_ExplicitInstantiationDefinition)
MarkVTableUsed(NameLoc, RecordDef, true);
// FIXME: We don't have any representation for explicit instantiations of
// member classes. Such a representation is not needed for compilation, but it
// should be available for clients that want to see all of the declarations in
// the source code.
return TagD;
}
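// Illustration (hypothetical user code, not part of this file): the member-class
// explicit instantiation handled by the overload above.
//
//   template<typename T> struct Outer { struct Inner { T value; }; };
//   template struct Outer<int>::Inner; // instantiates only the member class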
DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
SourceLocation ExternLoc,
SourceLocation TemplateLoc,
Declarator &D) {
// Explicit instantiations always require a name.
// TODO: check if/when DNInfo should replace Name.
DeclarationNameInfo NameInfo = GetNameForDeclarator(D);
DeclarationName Name = NameInfo.getName();
if (!Name) {
if (!D.isInvalidType())
Diag(D.getDeclSpec().getBeginLoc(),
diag::err_explicit_instantiation_requires_name)
<< D.getDeclSpec().getSourceRange() << D.getSourceRange();
return true;
}
// The scope passed in may not be a decl scope. Zip up the scope tree until
// we find one that is.
while ((S->getFlags() & Scope::DeclScope) == 0 ||
(S->getFlags() & Scope::TemplateParamScope) != 0)
S = S->getParent();
// Determine the type of the declaration.
TypeSourceInfo *T = GetTypeForDeclarator(D, S);
QualType R = T->getType();
if (R.isNull())
return true;
// C++ [dcl.stc]p1:
// A storage-class-specifier shall not be specified in [...] an explicit
// instantiation (14.7.2) directive.
if (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_typedef) {
Diag(D.getIdentifierLoc(), diag::err_explicit_instantiation_of_typedef)
<< Name;
return true;
} else if (D.getDeclSpec().getStorageClassSpec()
!= DeclSpec::SCS_unspecified) {
// Complain about, then remove, the storage class specifier.
Diag(D.getIdentifierLoc(), diag::err_explicit_instantiation_storage_class)
<< FixItHint::CreateRemoval(D.getDeclSpec().getStorageClassSpecLoc());
D.getMutableDeclSpec().ClearStorageClassSpecs();
}
// C++0x [temp.explicit]p1:
// [...] An explicit instantiation of a function template shall not use the
// inline or constexpr specifiers.
// Presumably, this also applies to member functions of class templates.
if (D.getDeclSpec().isInlineSpecified())
Diag(D.getDeclSpec().getInlineSpecLoc(),
getLangOpts().CPlusPlus11 ?
diag::err_explicit_instantiation_inline :
diag::warn_explicit_instantiation_inline_0x)
<< FixItHint::CreateRemoval(D.getDeclSpec().getInlineSpecLoc());
if (D.getDeclSpec().hasConstexprSpecifier() && R->isFunctionType())
// FIXME: Add a fix-it to remove the 'constexpr' and add a 'const' if one is
// not already specified.
Diag(D.getDeclSpec().getConstexprSpecLoc(),
diag::err_explicit_instantiation_constexpr);
// A deduction guide is not on the list of entities that can be explicitly
// instantiated.
if (Name.getNameKind() == DeclarationName::CXXDeductionGuideName) {
Diag(D.getDeclSpec().getBeginLoc(), diag::err_deduction_guide_specialized)
<< /*explicit instantiation*/ 0;
return true;
}
// C++0x [temp.explicit]p2:
// There are two forms of explicit instantiation: an explicit instantiation
// definition and an explicit instantiation declaration. An explicit
// instantiation declaration begins with the extern keyword. [...]
TemplateSpecializationKind TSK
= ExternLoc.isInvalid()? TSK_ExplicitInstantiationDefinition
: TSK_ExplicitInstantiationDeclaration;
LookupResult Previous(*this, NameInfo, LookupOrdinaryName);
LookupParsedName(Previous, S, &D.getCXXScopeSpec());
if (!R->isFunctionType()) {
// C++ [temp.explicit]p1:
// A [...] static data member of a class template can be explicitly
// instantiated from the member definition associated with its class
// template.
// C++1y [temp.explicit]p1:
// A [...] variable [...] template specialization can be explicitly
// instantiated from its template.
if (Previous.isAmbiguous())
return true;
VarDecl *Prev = Previous.getAsSingle<VarDecl>();
VarTemplateDecl *PrevTemplate = Previous.getAsSingle<VarTemplateDecl>();
if (!PrevTemplate) {
if (!Prev || !Prev->isStaticDataMember()) {
// We expect to see a static data member here.
Diag(D.getIdentifierLoc(), diag::err_explicit_instantiation_not_known)
<< Name;
for (LookupResult::iterator P = Previous.begin(), PEnd = Previous.end();
P != PEnd; ++P)
Diag((*P)->getLocation(), diag::note_explicit_instantiation_here);
return true;
}
if (!Prev->getInstantiatedFromStaticDataMember()) {
// FIXME: Check for explicit specialization?
Diag(D.getIdentifierLoc(),
diag::err_explicit_instantiation_data_member_not_instantiated)
<< Prev;
Diag(Prev->getLocation(), diag::note_explicit_instantiation_here);
// FIXME: Can we provide a note showing where this was declared?
return true;
}
} else {
// Explicitly instantiate a variable template.
// C++1y [dcl.spec.auto]p6:
// ... A program that uses auto or decltype(auto) in a context not
// explicitly allowed in this section is ill-formed.
//
// This includes auto-typed variable template instantiations.
if (R->isUndeducedType()) {
Diag(T->getTypeLoc().getBeginLoc(),
diag::err_auto_not_allowed_var_inst);
return true;
}
if (D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId) {
// C++1y [temp.explicit]p3:
// If the explicit instantiation is for a variable, the unqualified-id
// in the declaration shall be a template-id.
Diag(D.getIdentifierLoc(),
diag::err_explicit_instantiation_without_template_id)
<< PrevTemplate;
Diag(PrevTemplate->getLocation(),
diag::note_explicit_instantiation_here);
return true;
}
// Translate the parser's template argument list into our AST format.
TemplateArgumentListInfo TemplateArgs =
makeTemplateArgumentListInfo(*this, *D.getName().TemplateId);
DeclResult Res = CheckVarTemplateId(PrevTemplate, TemplateLoc,
D.getIdentifierLoc(), TemplateArgs);
if (Res.isInvalid())
return true;
// Ignore access control bits; we don't need them for redeclaration
// checking.
Prev = cast<VarDecl>(Res.get());
}
// C++0x [temp.explicit]p2:
// If the explicit instantiation is for a member function, a member class
// or a static data member of a class template specialization, the name of
// the class template specialization in the qualified-id for the member
// name shall be a simple-template-id.
//
// C++98 has the same restriction, just worded differently.
//
// This does not apply to variable template specializations, where the
// template-id is in the unqualified-id instead.
if (!ScopeSpecifierHasTemplateId(D.getCXXScopeSpec()) && !PrevTemplate)
Diag(D.getIdentifierLoc(),
diag::ext_explicit_instantiation_without_qualified_id)
<< Prev << D.getCXXScopeSpec().getRange();
CheckExplicitInstantiation(*this, Prev, D.getIdentifierLoc(), true, TSK);
// Verify that it is okay to explicitly instantiate here.
TemplateSpecializationKind PrevTSK = Prev->getTemplateSpecializationKind();
SourceLocation POI = Prev->getPointOfInstantiation();
bool HasNoEffect = false;
if (CheckSpecializationInstantiationRedecl(D.getIdentifierLoc(), TSK, Prev,
PrevTSK, POI, HasNoEffect))
return true;
if (!HasNoEffect) {
// Instantiate static data member or variable template.
Prev->setTemplateSpecializationKind(TSK, D.getIdentifierLoc());
// Merge attributes.
ProcessDeclAttributeList(S, Prev, D.getDeclSpec().getAttributes());
if (TSK == TSK_ExplicitInstantiationDefinition)
InstantiateVariableDefinition(D.getIdentifierLoc(), Prev);
}
// Check the new variable specialization against the parsed input.
if (PrevTemplate && Prev && !Context.hasSameType(Prev->getType(), R)) {
Diag(T->getTypeLoc().getBeginLoc(),
diag::err_invalid_var_template_spec_type)
<< 0 << PrevTemplate << R << Prev->getType();
Diag(PrevTemplate->getLocation(), diag::note_template_declared_here)
<< 2 << PrevTemplate->getDeclName();
return true;
}
// FIXME: Create an ExplicitInstantiation node?
return (Decl*) nullptr;
}
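// Illustration (hypothetical user code, not part of this file): the non-function
// entities accepted by the branch above.
//
//   template<typename T> struct Holder { static T value; };
//   template<typename T> T Holder<T>::value;
//   template int Holder<int>::value;  // static data member of a class template
//
//   template<typename T> T zero = T();
//   template double zero<double>;     // C++14 variable template specialization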
// If the declarator is a template-id, translate the parser's template
// argument list into our AST format.
bool HasExplicitTemplateArgs = false;
TemplateArgumentListInfo TemplateArgs;
if (D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId) {
TemplateArgs = makeTemplateArgumentListInfo(*this, *D.getName().TemplateId);
HasExplicitTemplateArgs = true;
}
// C++ [temp.explicit]p1:
// A [...] function [...] can be explicitly instantiated from its template.
// A member function [...] of a class template can be explicitly
// instantiated from the member definition associated with its class
// template.
UnresolvedSet<8> TemplateMatches;
FunctionDecl *NonTemplateMatch = nullptr;
TemplateSpecCandidateSet FailedCandidates(D.getIdentifierLoc());
for (LookupResult::iterator P = Previous.begin(), PEnd = Previous.end();
P != PEnd; ++P) {
NamedDecl *Prev = *P;
if (!HasExplicitTemplateArgs) {
if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(Prev)) {
QualType Adjusted = adjustCCAndNoReturn(R, Method->getType(),
/*AdjustExceptionSpec*/true);
if (Context.hasSameUnqualifiedType(Method->getType(), Adjusted)) {
if (Method->getPrimaryTemplate()) {
TemplateMatches.addDecl(Method, P.getAccess());
} else {
// FIXME: Can this assert ever happen? Needs a test.
assert(!NonTemplateMatch && "Multiple NonTemplateMatches");
NonTemplateMatch = Method;
}
}
}
}
FunctionTemplateDecl *FunTmpl = dyn_cast<FunctionTemplateDecl>(Prev);
if (!FunTmpl)
continue;
TemplateDeductionInfo Info(FailedCandidates.getLocation());
FunctionDecl *Specialization = nullptr;
if (TemplateDeductionResult TDK
= DeduceTemplateArguments(FunTmpl,
(HasExplicitTemplateArgs ? &TemplateArgs
: nullptr),
R, Specialization, Info)) {
// Keep track of almost-matches.
FailedCandidates.addCandidate()
.set(P.getPair(), FunTmpl->getTemplatedDecl(),
MakeDeductionFailureInfo(Context, TDK, Info));
(void)TDK;
continue;
}
// Target attributes are part of the cuda function signature, so
// the cuda target of the instantiated function must match that of its
// template. Given that C++ template deduction does not take
// target attributes into account, we reject candidates here that
// have a different target.
if (LangOpts.CUDA &&
IdentifyCUDATarget(Specialization,
/* IgnoreImplicitHDAttr = */ true) !=
IdentifyCUDATarget(D.getDeclSpec().getAttributes())) {
FailedCandidates.addCandidate().set(
P.getPair(), FunTmpl->getTemplatedDecl(),
MakeDeductionFailureInfo(Context, TDK_CUDATargetMismatch, Info));
continue;
}
TemplateMatches.addDecl(Specialization, P.getAccess());
}
FunctionDecl *Specialization = NonTemplateMatch;
if (!Specialization) {
// Find the most specialized function template specialization.
UnresolvedSetIterator Result = getMostSpecialized(
TemplateMatches.begin(), TemplateMatches.end(), FailedCandidates,
D.getIdentifierLoc(),
PDiag(diag::err_explicit_instantiation_not_known) << Name,
PDiag(diag::err_explicit_instantiation_ambiguous) << Name,
PDiag(diag::note_explicit_instantiation_candidate));
if (Result == TemplateMatches.end())
return true;
// Ignore access control bits; we don't need them for redeclaration checking.
Specialization = cast<FunctionDecl>(*Result);
}
// C++11 [except.spec]p4
// In an explicit instantiation an exception-specification may be specified,
// but is not required.
// If an exception-specification is specified in an explicit instantiation
// directive, it shall be compatible with the exception-specifications of
// other declarations of that function.
if (auto *FPT = R->getAs<FunctionProtoType>())
if (FPT->hasExceptionSpec()) {
unsigned DiagID =
diag::err_mismatched_exception_spec_explicit_instantiation;
if (getLangOpts().MicrosoftExt)
DiagID = diag::ext_mismatched_exception_spec_explicit_instantiation;
bool Result = CheckEquivalentExceptionSpec(
PDiag(DiagID) << Specialization->getType(),
PDiag(diag::note_explicit_instantiation_here),
Specialization->getType()->getAs<FunctionProtoType>(),
Specialization->getLocation(), FPT, D.getBeginLoc());
// In Microsoft mode, mismatching exception specifications just cause a
// warning.
if (!getLangOpts().MicrosoftExt && Result)
return true;
}
if (Specialization->getTemplateSpecializationKind() == TSK_Undeclared) {
Diag(D.getIdentifierLoc(),
diag::err_explicit_instantiation_member_function_not_instantiated)
<< Specialization
<< (Specialization->getTemplateSpecializationKind() ==
TSK_ExplicitSpecialization);
Diag(Specialization->getLocation(), diag::note_explicit_instantiation_here);
return true;
}
FunctionDecl *PrevDecl = Specialization->getPreviousDecl();
if (!PrevDecl && Specialization->isThisDeclarationADefinition())
PrevDecl = Specialization;
if (PrevDecl) {
bool HasNoEffect = false;
if (CheckSpecializationInstantiationRedecl(D.getIdentifierLoc(), TSK,
PrevDecl,
PrevDecl->getTemplateSpecializationKind(),
PrevDecl->getPointOfInstantiation(),
HasNoEffect))
return true;
// FIXME: We may still want to build some representation of this
// explicit specialization.
if (HasNoEffect)
return (Decl*) nullptr;
}
// HACK: libc++ has a bug where it attempts to explicitly instantiate the
// functions
// valarray<size_t>::valarray(size_t) and
// valarray<size_t>::~valarray()
// that it declared to have internal linkage with the internal_linkage
// attribute. Ignore the explicit instantiation declaration in this case.
if (Specialization->hasAttr<InternalLinkageAttr>() &&
TSK == TSK_ExplicitInstantiationDeclaration) {
if (auto *RD = dyn_cast<CXXRecordDecl>(Specialization->getDeclContext()))
if (RD->getIdentifier() && RD->getIdentifier()->isStr("valarray") &&
RD->isInStdNamespace())
return (Decl*) nullptr;
}
ProcessDeclAttributeList(S, Specialization, D.getDeclSpec().getAttributes());
// In MSVC mode, dllimported explicit instantiation definitions are treated as
// instantiation declarations.
if (TSK == TSK_ExplicitInstantiationDefinition &&
Specialization->hasAttr<DLLImportAttr>() &&
Context.getTargetInfo().getCXXABI().isMicrosoft())
TSK = TSK_ExplicitInstantiationDeclaration;
Specialization->setTemplateSpecializationKind(TSK, D.getIdentifierLoc());
if (Specialization->isDefined()) {
// Let the ASTConsumer know that this function has been explicitly
// instantiated now, and its linkage might have changed.
Consumer.HandleTopLevelDecl(DeclGroupRef(Specialization));
} else if (TSK == TSK_ExplicitInstantiationDefinition)
InstantiateFunctionDefinition(D.getIdentifierLoc(), Specialization);
// C++0x [temp.explicit]p2:
// If the explicit instantiation is for a member function, a member class
// or a static data member of a class template specialization, the name of
// the class template specialization in the qualified-id for the member
// name shall be a simple-template-id.
//
// C++98 has the same restriction, just worded differently.
FunctionTemplateDecl *FunTmpl = Specialization->getPrimaryTemplate();
if (D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId && !FunTmpl &&
D.getCXXScopeSpec().isSet() &&
!ScopeSpecifierHasTemplateId(D.getCXXScopeSpec()))
Diag(D.getIdentifierLoc(),
diag::ext_explicit_instantiation_without_qualified_id)
<< Specialization << D.getCXXScopeSpec().getRange();
CheckExplicitInstantiation(
*this,
FunTmpl ? (NamedDecl *)FunTmpl
: Specialization->getInstantiatedFromMemberFunction(),
D.getIdentifierLoc(), D.getCXXScopeSpec().isSet(), TSK);
// FIXME: Create some kind of ExplicitInstantiationDecl here.
return (Decl*) nullptr;
}
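// Illustration (hypothetical user code, not part of this file): function-template
// explicit instantiations matched by the deduction loop above.
//
//   template<typename T> T twice(T v) { return v + v; }
//   template int twice<int>(int); // explicit template arguments
//   template long twice(long);    // arguments deduced from the declared type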
TypeResult
Sema::ActOnDependentTag(Scope *S, unsigned TagSpec, TagUseKind TUK,
const CXXScopeSpec &SS, IdentifierInfo *Name,
SourceLocation TagLoc, SourceLocation NameLoc) {
// This has to hold, because SS is expected to be defined.
assert(Name && "Expected a name in a dependent tag");
NestedNameSpecifier *NNS = SS.getScopeRep();
if (!NNS)
return true;
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
if (TUK == TUK_Declaration || TUK == TUK_Definition) {
Diag(NameLoc, diag::err_dependent_tag_decl)
<< (TUK == TUK_Definition) << Kind << SS.getRange();
return true;
}
// Create the resulting type.
ElaboratedTypeKeyword Kwd = TypeWithKeyword::getKeywordForTagTypeKind(Kind);
QualType Result = Context.getDependentNameType(Kwd, NNS, Name);
// Create type-source location information for this type.
TypeLocBuilder TLB;
DependentNameTypeLoc TL = TLB.push<DependentNameTypeLoc>(Result);
TL.setElaboratedKeywordLoc(TagLoc);
TL.setQualifierLoc(SS.getWithLocInContext(Context));
TL.setNameLoc(NameLoc);
return CreateParsedType(Result, TLB.getTypeSourceInfo(Context, Result));
}
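// Illustration (hypothetical user code, not part of this file): a dependent
// elaborated-type-specifier of the kind ActOnDependentTag turns into a
// DependentNameType.
//
//   template<typename T> void walk() {
//     struct T::Node *head = nullptr; // TUK_Reference with a dependent qualifier
//     (void)head;
//   }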
TypeResult
Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc,
const CXXScopeSpec &SS, const IdentifierInfo &II,
SourceLocation IdLoc) {
if (SS.isInvalid())
return true;
if (TypenameLoc.isValid() && S && !S->getTemplateParamParent())
Diag(TypenameLoc,
getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_typename_outside_of_template :
diag::ext_typename_outside_of_template)
<< FixItHint::CreateRemoval(TypenameLoc);
NestedNameSpecifierLoc QualifierLoc = SS.getWithLocInContext(Context);
QualType T = CheckTypenameType(TypenameLoc.isValid()? ETK_Typename : ETK_None,
TypenameLoc, QualifierLoc, II, IdLoc);
if (T.isNull())
return true;
TypeSourceInfo *TSI = Context.CreateTypeSourceInfo(T);
if (isa<DependentNameType>(T)) {
DependentNameTypeLoc TL = TSI->getTypeLoc().castAs<DependentNameTypeLoc>();
TL.setElaboratedKeywordLoc(TypenameLoc);
TL.setQualifierLoc(QualifierLoc);
TL.setNameLoc(IdLoc);
} else {
ElaboratedTypeLoc TL = TSI->getTypeLoc().castAs<ElaboratedTypeLoc>();
TL.setElaboratedKeywordLoc(TypenameLoc);
TL.setQualifierLoc(QualifierLoc);
TL.getNamedTypeLoc().castAs<TypeSpecTypeLoc>().setNameLoc(IdLoc);
}
return CreateParsedType(T, TSI);
}
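// Illustration (hypothetical user code, not part of this file): the identifier
// form of the typename-specifier handled above.
//
//   template<typename T> struct Wrap { typedef T type; };
//   template<typename T> void use() {
//     typename Wrap<T>::type v{}; // parsed here as a DependentNameType
//   }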
TypeResult
Sema::ActOnTypenameType(Scope *S,
SourceLocation TypenameLoc,
const CXXScopeSpec &SS,
SourceLocation TemplateKWLoc,
TemplateTy TemplateIn,
IdentifierInfo *TemplateII,
SourceLocation TemplateIILoc,
SourceLocation LAngleLoc,
ASTTemplateArgsPtr TemplateArgsIn,
SourceLocation RAngleLoc) {
if (TypenameLoc.isValid() && S && !S->getTemplateParamParent())
Diag(TypenameLoc,
getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_typename_outside_of_template :
diag::ext_typename_outside_of_template)
<< FixItHint::CreateRemoval(TypenameLoc);
// Strangely, non-type results are not ignored by this lookup, so the
// program is ill-formed if it finds an injected-class-name.
if (TypenameLoc.isValid()) {
auto *LookupRD =
dyn_cast_or_null<CXXRecordDecl>(computeDeclContext(SS, false));
if (LookupRD && LookupRD->getIdentifier() == TemplateII) {
Diag(TemplateIILoc,
diag::ext_out_of_line_qualified_id_type_names_constructor)
<< TemplateII << 0 /*injected-class-name used as template name*/
<< (TemplateKWLoc.isValid() ? 1 : 0 /*'template'/'typename' keyword*/);
}
}
// Translate the parser's template argument list into our AST format.
TemplateArgumentListInfo TemplateArgs(LAngleLoc, RAngleLoc);
translateTemplateArguments(TemplateArgsIn, TemplateArgs);
TemplateName Template = TemplateIn.get();
if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) {
// Construct a dependent template specialization type.
assert(DTN && "dependent template has non-dependent name?");
assert(DTN->getQualifier() == SS.getScopeRep());
QualType T = Context.getDependentTemplateSpecializationType(ETK_Typename,
DTN->getQualifier(),
DTN->getIdentifier(),
TemplateArgs);
// Create source-location information for this type.
TypeLocBuilder Builder;
DependentTemplateSpecializationTypeLoc SpecTL
= Builder.push<DependentTemplateSpecializationTypeLoc>(T);
SpecTL.setElaboratedKeywordLoc(TypenameLoc);
SpecTL.setQualifierLoc(SS.getWithLocInContext(Context));
SpecTL.setTemplateKeywordLoc(TemplateKWLoc);
SpecTL.setTemplateNameLoc(TemplateIILoc);
SpecTL.setLAngleLoc(LAngleLoc);
SpecTL.setRAngleLoc(RAngleLoc);
for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I)
SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo());
return CreateParsedType(T, Builder.getTypeSourceInfo(Context, T));
}
QualType T = CheckTemplateIdType(Template, TemplateIILoc, TemplateArgs);
if (T.isNull())
return true;
// Provide source-location information for the template specialization type.
TypeLocBuilder Builder;
TemplateSpecializationTypeLoc SpecTL
= Builder.push<TemplateSpecializationTypeLoc>(T);
SpecTL.setTemplateKeywordLoc(TemplateKWLoc);
SpecTL.setTemplateNameLoc(TemplateIILoc);
SpecTL.setLAngleLoc(LAngleLoc);
SpecTL.setRAngleLoc(RAngleLoc);
for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I)
SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo());
T = Context.getElaboratedType(ETK_Typename, SS.getScopeRep(), T);
ElaboratedTypeLoc TL = Builder.push<ElaboratedTypeLoc>(T);
TL.setElaboratedKeywordLoc(TypenameLoc);
TL.setQualifierLoc(SS.getWithLocInContext(Context));
TypeSourceInfo *TSI = Builder.getTypeSourceInfo(Context, T);
return CreateParsedType(T, TSI);
}
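// Illustration (hypothetical user code, not part of this file): the template-id
// form of the typename-specifier handled above, where the final component is
// itself a template-id.
//
//   template<typename T> struct Traits {
//     template<typename U> struct Rebind { U *ptr; };
//   };
//   template<typename T>
//   void g(typename Traits<T>::template Rebind<int> r);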
/// Determine whether this failed name lookup should be treated as being
/// disabled by a usage of std::enable_if.
static bool isEnableIf(NestedNameSpecifierLoc NNS, const IdentifierInfo &II,
SourceRange &CondRange, Expr *&Cond) {
// We must be looking for a ::type...
if (!II.isStr("type"))
return false;
// ... within an explicitly-written template specialization...
if (!NNS || !NNS.getNestedNameSpecifier()->getAsType())
return false;
TypeLoc EnableIfTy = NNS.getTypeLoc();
TemplateSpecializationTypeLoc EnableIfTSTLoc =
EnableIfTy.getAs<TemplateSpecializationTypeLoc>();
if (!EnableIfTSTLoc || EnableIfTSTLoc.getNumArgs() == 0)
return false;
const TemplateSpecializationType *EnableIfTST = EnableIfTSTLoc.getTypePtr();
// ... which names a complete class template declaration...
const TemplateDecl *EnableIfDecl =
EnableIfTST->getTemplateName().getAsTemplateDecl();
if (!EnableIfDecl || EnableIfTST->isIncompleteType())
return false;
// ... called "enable_if".
const IdentifierInfo *EnableIfII =
EnableIfDecl->getDeclName().getAsIdentifierInfo();
if (!EnableIfII || !EnableIfII->isStr("enable_if"))
return false;
// Assume the first template argument is the condition.
CondRange = EnableIfTSTLoc.getArgLoc(0).getSourceRange();
// Dig out the condition.
Cond = nullptr;
if (EnableIfTSTLoc.getArgLoc(0).getArgument().getKind()
!= TemplateArgument::Expression)
return true;
Cond = EnableIfTSTLoc.getArgLoc(0).getSourceExpression();
// Ignore Boolean literals; they add no value.
if (isa<CXXBoolLiteralExpr>(Cond->IgnoreParenCasts()))
Cond = nullptr;
return true;
}
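// Illustration (hypothetical user code, not part of this file): the pattern the
// helper above detects so the diagnostic can name the failed condition.
//
//   #include <type_traits>
//   template<typename T> struct RequireIntegral {
//     typedef typename std::enable_if<std::is_integral<T>::value, T>::type type;
//   };
//   RequireIntegral<double>::type x; // lookup of '::type' fails; the failed
//                                    // condition is_integral<double>::value is
//                                    // extracted by isEnableIf for the message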
/// Build the type that describes a C++ typename specifier,
/// e.g., "typename T::type".
QualType
Sema::CheckTypenameType(ElaboratedTypeKeyword Keyword,
SourceLocation KeywordLoc,
NestedNameSpecifierLoc QualifierLoc,
const IdentifierInfo &II,
SourceLocation IILoc) {
CXXScopeSpec SS;
SS.Adopt(QualifierLoc);
DeclContext *Ctx = computeDeclContext(SS);
if (!Ctx) {
// If the nested-name-specifier is dependent and couldn't be
// resolved to a type, build a typename type.
assert(QualifierLoc.getNestedNameSpecifier()->isDependent());
return Context.getDependentNameType(Keyword,
QualifierLoc.getNestedNameSpecifier(),
&II);
}
// If the nested-name-specifier refers to the current instantiation,
// the "typename" keyword itself is superfluous. In C++03, the
// program is actually ill-formed. However, DR 382 (in C++0x CD1)
// allows such extraneous "typename" keywords, and we retroactively
// apply this DR to C++03 code with only a warning. In any case, we continue.
if (RequireCompleteDeclContext(SS, Ctx))
return QualType();
DeclarationName Name(&II);
LookupResult Result(*this, Name, IILoc, LookupOrdinaryName);
LookupQualifiedName(Result, Ctx, SS);
unsigned DiagID = 0;
Decl *Referenced = nullptr;
switch (Result.getResultKind()) {
case LookupResult::NotFound: {
// If we're looking up 'type' within a template named 'enable_if', produce
// a more specific diagnostic.
SourceRange CondRange;
Expr *Cond = nullptr;
if (isEnableIf(QualifierLoc, II, CondRange, Cond)) {
// If we have a condition, narrow it down to the specific failed
// condition.
if (Cond) {
Expr *FailedCond;
std::string FailedDescription;
std::tie(FailedCond, FailedDescription) =
findFailedBooleanCondition(Cond);
Diag(FailedCond->getExprLoc(),
diag::err_typename_nested_not_found_requirement)
<< FailedDescription
<< FailedCond->getSourceRange();
return QualType();
}
Diag(CondRange.getBegin(), diag::err_typename_nested_not_found_enable_if)
<< Ctx << CondRange;
return QualType();
}
DiagID = diag::err_typename_nested_not_found;
break;
}
case LookupResult::FoundUnresolvedValue: {
// We found a using declaration that is a value. Most likely, the using
// declaration itself is meant to have the 'typename' keyword.
SourceRange FullRange(KeywordLoc.isValid() ? KeywordLoc : SS.getBeginLoc(),
IILoc);
Diag(IILoc, diag::err_typename_refers_to_using_value_decl)
<< Name << Ctx << FullRange;
if (UnresolvedUsingValueDecl *Using
= dyn_cast<UnresolvedUsingValueDecl>(Result.getRepresentativeDecl())){
SourceLocation Loc = Using->getQualifierLoc().getBeginLoc();
Diag(Loc, diag::note_using_value_decl_missing_typename)
<< FixItHint::CreateInsertion(Loc, "typename ");
}
}
// Fall through to create a dependent typename type, from which we can recover
// better.
LLVM_FALLTHROUGH;
case LookupResult::NotFoundInCurrentInstantiation:
// Okay, it's a member of an unknown instantiation.
return Context.getDependentNameType(Keyword,
QualifierLoc.getNestedNameSpecifier(),
&II);
case LookupResult::Found:
if (TypeDecl *Type = dyn_cast<TypeDecl>(Result.getFoundDecl())) {
// C++ [class.qual]p2:
// In a lookup in which function names are not ignored and the
// nested-name-specifier nominates a class C, if the name specified
// after the nested-name-specifier, when looked up in C, is the
// injected-class-name of C [...] then the name is instead considered
// to name the constructor of class C.
//
// Unlike in an elaborated-type-specifier, function names are not ignored
// in typename-specifier lookup. However, they are ignored in all the
// contexts where we form a typename type with no keyword (that is, in
// mem-initializer-ids, base-specifiers, and elaborated-type-specifiers).
//
// FIXME: That's not strictly true: mem-initializer-id lookup does not
// ignore functions, but that appears to be an oversight.
auto *LookupRD = dyn_cast_or_null<CXXRecordDecl>(Ctx);
auto *FoundRD = dyn_cast<CXXRecordDecl>(Type);
if (Keyword == ETK_Typename && LookupRD && FoundRD &&
FoundRD->isInjectedClassName() &&
declaresSameEntity(LookupRD, cast<Decl>(FoundRD->getParent())))
Diag(IILoc, diag::ext_out_of_line_qualified_id_type_names_constructor)
<< &II << 1 << 0 /*'typename' keyword used*/;
// We found a type. Build an ElaboratedType, since the
// typename-specifier was just sugar.
MarkAnyDeclReferenced(Type->getLocation(), Type, /*OdrUse=*/false);
return Context.getElaboratedType(Keyword,
QualifierLoc.getNestedNameSpecifier(),
Context.getTypeDeclType(Type));
}
// C++ [dcl.type.simple]p2:
// A type-specifier of the form
// typename[opt] nested-name-specifier[opt] template-name
// is a placeholder for a deduced class type [...].
if (getLangOpts().CPlusPlus17) {
if (auto *TD = getAsTypeTemplateDecl(Result.getFoundDecl())) {
return Context.getElaboratedType(
Keyword, QualifierLoc.getNestedNameSpecifier(),
Context.getDeducedTemplateSpecializationType(TemplateName(TD),
QualType(), false));
}
}
DiagID = diag::err_typename_nested_not_type;
Referenced = Result.getFoundDecl();
break;
case LookupResult::FoundOverloaded:
DiagID = diag::err_typename_nested_not_type;
Referenced = *Result.begin();
break;
case LookupResult::Ambiguous:
return QualType();
}
// If we get here, it's because name lookup did not find a
// type. Emit an appropriate diagnostic and return an error.
SourceRange FullRange(KeywordLoc.isValid() ? KeywordLoc : SS.getBeginLoc(),
IILoc);
Diag(IILoc, DiagID) << FullRange << Name << Ctx;
if (Referenced)
Diag(Referenced->getLocation(), diag::note_typename_refers_here)
<< Name;
return QualType();
}
namespace {
// See Sema::RebuildTypeInCurrentInstantiation
class CurrentInstantiationRebuilder
: public TreeTransform<CurrentInstantiationRebuilder> {
SourceLocation Loc;
DeclarationName Entity;
public:
typedef TreeTransform<CurrentInstantiationRebuilder> inherited;
CurrentInstantiationRebuilder(Sema &SemaRef,
SourceLocation Loc,
DeclarationName Entity)
: TreeTransform<CurrentInstantiationRebuilder>(SemaRef),
Loc(Loc), Entity(Entity) { }
/// Determine whether the given type \p T has already been
/// transformed.
///
/// For the purposes of type reconstruction, a type has already been
/// transformed if it is NULL or if it is not dependent.
bool AlreadyTransformed(QualType T) {
return T.isNull() || !T->isDependentType();
}
/// Returns the location of the entity whose type is being
/// rebuilt.
SourceLocation getBaseLocation() { return Loc; }
/// Returns the name of the entity whose type is being rebuilt.
DeclarationName getBaseEntity() { return Entity; }
/// Sets the "base" location and entity when that
/// information is known based on another transformation.
void setBase(SourceLocation Loc, DeclarationName Entity) {
this->Loc = Loc;
this->Entity = Entity;
}
ExprResult TransformLambdaExpr(LambdaExpr *E) {
// Lambdas never need to be transformed.
return E;
}
};
} // end anonymous namespace
/// Rebuilds a type within the context of the current instantiation.
///
/// The type \p T is part of the type of an out-of-line member definition of
/// a class template (or class template partial specialization) that was parsed
/// and constructed before we entered the scope of the class template (or
/// partial specialization thereof). This routine will rebuild that type now
/// that we have entered the declarator's scope, which may produce different
/// canonical types, e.g.,
///
/// \code
/// template<typename T>
/// struct X {
/// typedef T* pointer;
/// pointer data();
/// };
///
/// template<typename T>
/// typename X<T>::pointer X<T>::data() { ... }
/// \endcode
///
/// Here, the type "typename X<T>::pointer" will be created as a DependentNameType,
/// since we do not know that we can look into X<T> when we parsed the type.
/// This function will rebuild the type, performing the lookup of "pointer"
/// in X<T> and returning an ElaboratedType whose canonical type is the same
/// as the canonical type of T*, allowing the return types of the out-of-line
/// definition and the declaration to match.
TypeSourceInfo *Sema::RebuildTypeInCurrentInstantiation(TypeSourceInfo *T,
SourceLocation Loc,
DeclarationName Name) {
if (!T || !T->getType()->isDependentType())
return T;
CurrentInstantiationRebuilder Rebuilder(*this, Loc, Name);
return Rebuilder.TransformType(T);
}
ExprResult Sema::RebuildExprInCurrentInstantiation(Expr *E) {
CurrentInstantiationRebuilder Rebuilder(*this, E->getExprLoc(),
DeclarationName());
return Rebuilder.TransformExpr(E);
}
bool Sema::RebuildNestedNameSpecifierInCurrentInstantiation(CXXScopeSpec &SS) {
if (SS.isInvalid())
return true;
NestedNameSpecifierLoc QualifierLoc = SS.getWithLocInContext(Context);
CurrentInstantiationRebuilder Rebuilder(*this, SS.getRange().getBegin(),
DeclarationName());
NestedNameSpecifierLoc Rebuilt
= Rebuilder.TransformNestedNameSpecifierLoc(QualifierLoc);
if (!Rebuilt)
return true;
SS.Adopt(Rebuilt);
return false;
}
/// Rebuild the template parameters now that we know we're in a current
/// instantiation.
bool Sema::RebuildTemplateParamsInCurrentInstantiation(
TemplateParameterList *Params) {
for (unsigned I = 0, N = Params->size(); I != N; ++I) {
Decl *Param = Params->getParam(I);
// There is nothing to rebuild in a type parameter.
if (isa<TemplateTypeParmDecl>(Param))
continue;
// Rebuild the template parameter list of a template template parameter.
if (TemplateTemplateParmDecl *TTP
= dyn_cast<TemplateTemplateParmDecl>(Param)) {
if (RebuildTemplateParamsInCurrentInstantiation(
TTP->getTemplateParameters()))
return true;
continue;
}
// Rebuild the type of a non-type template parameter.
NonTypeTemplateParmDecl *NTTP = cast<NonTypeTemplateParmDecl>(Param);
TypeSourceInfo *NewTSI
= RebuildTypeInCurrentInstantiation(NTTP->getTypeSourceInfo(),
NTTP->getLocation(),
NTTP->getDeclName());
if (!NewTSI)
return true;
if (NewTSI->getType()->isUndeducedType()) {
// C++17 [temp.dep.expr]p3:
// An id-expression is type-dependent if it contains
// - an identifier associated by name lookup with a non-type
// template-parameter declared with a type that contains a
// placeholder type (7.1.7.4),
NewTSI = SubstAutoTypeSourceInfo(NewTSI, Context.DependentTy);
}
if (NewTSI != NTTP->getTypeSourceInfo()) {
NTTP->setTypeSourceInfo(NewTSI);
NTTP->setType(NewTSI->getType());
}
}
return false;
}
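// Illustration (hypothetical user code, not part of this file): an out-of-line
// member template whose non-type template parameter has a dependent type that
// must be rebuilt inside the current instantiation.
//
//   template<typename T> struct S {
//     typedef int size_type;
//     template<size_type N> void f();
//   };
//   template<typename T>
//   template<typename S<T>::size_type N> void S<T>::f() {} // type rebuilt here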
/// Produces a formatted string that describes the binding of
/// template parameters to template arguments.
std::string
Sema::getTemplateArgumentBindingsText(const TemplateParameterList *Params,
const TemplateArgumentList &Args) {
return getTemplateArgumentBindingsText(Params, Args.data(), Args.size());
}
std::string
Sema::getTemplateArgumentBindingsText(const TemplateParameterList *Params,
const TemplateArgument *Args,
unsigned NumArgs) {
SmallString<128> Str;
llvm::raw_svector_ostream Out(Str);
if (!Params || Params->size() == 0 || NumArgs == 0)
return std::string();
for (unsigned I = 0, N = Params->size(); I != N; ++I) {
if (I >= NumArgs)
break;
if (I == 0)
Out << "[with ";
else
Out << ", ";
if (const IdentifierInfo *Id = Params->getParam(I)->getIdentifier()) {
Out << Id->getName();
} else {
Out << '$' << I;
}
Out << " = ";
Args[I].print(getPrintingPolicy(), Out);
}
Out << ']';
return Out.str();
}
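// Illustration (hypothetical user code, not part of this file): for
//
//   template<typename T, int N> struct Array {};
//   Array<int, 3> a;
//
// the bindings above are rendered as "[with T = int, N = 3]".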
void Sema::MarkAsLateParsedTemplate(FunctionDecl *FD, Decl *FnD,
CachedTokens &Toks) {
if (!FD)
return;
auto LPT = llvm::make_unique<LateParsedTemplate>();
// Take tokens to avoid allocations
LPT->Toks.swap(Toks);
LPT->D = FnD;
LateParsedTemplateMap.insert(std::make_pair(FD, std::move(LPT)));
FD->setLateTemplateParsed(true);
}
void Sema::UnmarkAsLateParsedTemplate(FunctionDecl *FD) {
if (!FD)
return;
FD->setLateTemplateParsed(false);
}
bool Sema::IsInsideALocalClassWithinATemplateFunction() {
DeclContext *DC = CurContext;
while (DC) {
if (CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(CurContext)) {
const FunctionDecl *FD = RD->isLocalClass();
return (FD && FD->getTemplatedKind() != FunctionDecl::TK_NonTemplate);
} else if (DC->isTranslationUnit() || DC->isNamespace())
return false;
DC = DC->getParent();
}
return false;
}
namespace {
/// Walk the path from which a declaration was instantiated, and check
/// that every explicit specialization along that path is visible. This enforces
/// C++ [temp.expl.spec]/6:
///
/// If a template, a member template or a member of a class template is
/// explicitly specialized then that specialization shall be declared before
/// the first use of that specialization that would cause an implicit
/// instantiation to take place, in every translation unit in which such a
/// use occurs; no diagnostic is required.
///
/// and also C++ [temp.class.spec]/1:
///
/// A partial specialization shall be declared before the first use of a
/// class template specialization that would make use of the partial
/// specialization as the result of an implicit or explicit instantiation
/// in every translation unit in which such a use occurs; no diagnostic is
/// required.
class ExplicitSpecializationVisibilityChecker {
Sema &S;
SourceLocation Loc;
llvm::SmallVector<Module *, 8> Modules;
public:
ExplicitSpecializationVisibilityChecker(Sema &S, SourceLocation Loc)
: S(S), Loc(Loc) {}
void check(NamedDecl *ND) {
if (auto *FD = dyn_cast<FunctionDecl>(ND))
return checkImpl(FD);
if (auto *RD = dyn_cast<CXXRecordDecl>(ND))
return checkImpl(RD);
if (auto *VD = dyn_cast<VarDecl>(ND))
return checkImpl(VD);
if (auto *ED = dyn_cast<EnumDecl>(ND))
return checkImpl(ED);
}
private:
void diagnose(NamedDecl *D, bool IsPartialSpec) {
auto Kind = IsPartialSpec ? Sema::MissingImportKind::PartialSpecialization
: Sema::MissingImportKind::ExplicitSpecialization;
const bool Recover = true;
// If we got a custom set of modules (because only a subset of the
// declarations are interesting), use them, otherwise let
// diagnoseMissingImport intelligently pick some.
if (Modules.empty())
S.diagnoseMissingImport(Loc, D, Kind, Recover);
else
S.diagnoseMissingImport(Loc, D, D->getLocation(), Modules, Kind, Recover);
}
// Check a specific declaration. There are three problematic cases:
//
// 1) The declaration is an explicit specialization of a template
// specialization.
// 2) The declaration is an explicit specialization of a member of a
// templated class.
// 3) The declaration is an instantiation of a template, and that template
// is an explicit specialization of a member of a templated class.
//
// We don't need to go any deeper than that, as the instantiation of the
// surrounding class / etc is not triggered by whatever triggered this
// instantiation, and thus should be checked elsewhere.
template<typename SpecDecl>
void checkImpl(SpecDecl *Spec) {
bool IsHiddenExplicitSpecialization = false;
if (Spec->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) {
IsHiddenExplicitSpecialization =
Spec->getMemberSpecializationInfo()
? !S.hasVisibleMemberSpecialization(Spec, &Modules)
: !S.hasVisibleExplicitSpecialization(Spec, &Modules);
} else {
checkInstantiated(Spec);
}
if (IsHiddenExplicitSpecialization)
diagnose(Spec->getMostRecentDecl(), false);
}
void checkInstantiated(FunctionDecl *FD) {
if (auto *TD = FD->getPrimaryTemplate())
checkTemplate(TD);
}
void checkInstantiated(CXXRecordDecl *RD) {
auto *SD = dyn_cast<ClassTemplateSpecializationDecl>(RD);
if (!SD)
return;
auto From = SD->getSpecializedTemplateOrPartial();
if (auto *TD = From.dyn_cast<ClassTemplateDecl *>())
checkTemplate(TD);
else if (auto *TD =
From.dyn_cast<ClassTemplatePartialSpecializationDecl *>()) {
if (!S.hasVisibleDeclaration(TD))
diagnose(TD, true);
checkTemplate(TD);
}
}
void checkInstantiated(VarDecl *RD) {
auto *SD = dyn_cast<VarTemplateSpecializationDecl>(RD);
if (!SD)
return;
auto From = SD->getSpecializedTemplateOrPartial();
if (auto *TD = From.dyn_cast<VarTemplateDecl *>())
checkTemplate(TD);
else if (auto *TD =
From.dyn_cast<VarTemplatePartialSpecializationDecl *>()) {
if (!S.hasVisibleDeclaration(TD))
diagnose(TD, true);
checkTemplate(TD);
}
}
void checkInstantiated(EnumDecl *FD) {}
template<typename TemplDecl>
void checkTemplate(TemplDecl *TD) {
if (TD->isMemberSpecialization()) {
if (!S.hasVisibleMemberSpecialization(TD, &Modules))
diagnose(TD->getMostRecentDecl(), false);
}
}
};
} // end anonymous namespace
void Sema::checkSpecializationVisibility(SourceLocation Loc, NamedDecl *Spec) {
if (!getLangOpts().Modules)
return;
ExplicitSpecializationVisibilityChecker(*this, Loc).check(Spec);
}
/// Check whether a template partial specialization that we've discovered
/// is hidden, and produce suitable diagnostics if so.
void Sema::checkPartialSpecializationVisibility(SourceLocation Loc,
NamedDecl *Spec) {
llvm::SmallVector<Module *, 8> Modules;
if (!hasVisibleDeclaration(Spec, &Modules))
diagnoseMissingImport(Loc, Spec, Spec->getLocation(), Modules,
MissingImportKind::PartialSpecialization,
/*Recover*/true);
}
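// Illustration (hypothetical user code, not part of this file): with modules
// enabled, the checks above catch uses of a specialization whose declaration
// is not visible in the current translation unit.
//
//   // A module that this translation unit has not imported declares:
//   //   template<typename T> struct Hash { enum { value = 0 }; };
//   //   template<> struct Hash<int> { enum { value = 1 }; };
//   int v = Hash<int>::value; // implicit instantiation; the hidden explicit
//                             // specialization triggers diagnoseMissingImport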
Index: projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaType.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaType.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/clang/lib/Sema/SemaType.cpp (revision 351722)
@@ -1,8433 +1,8447 @@
//===--- SemaType.cpp - Semantic Analysis for Types -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements type-related semantic analysis.
//
//===----------------------------------------------------------------------===//
#include "TypeLocBuilder.h"
#include "clang/AST/ASTConsumer.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTMutationListener.h"
#include "clang/AST/ASTStructuralEquivalence.h"
#include "clang/AST/CXXInheritance.h"
#include "clang/AST/DeclObjC.h"
#include "clang/AST/DeclTemplate.h"
#include "clang/AST/Expr.h"
#include "clang/AST/TypeLoc.h"
#include "clang/AST/TypeLocVisitor.h"
#include "clang/Basic/PartialDiagnostic.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Sema/DeclSpec.h"
#include "clang/Sema/DelayedDiagnostic.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/ScopeInfo.h"
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/Template.h"
#include "clang/Sema/TemplateInstCallback.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
using namespace clang;
enum TypeDiagSelector {
TDS_Function,
TDS_Pointer,
TDS_ObjCObjOrBlock
};
/// isOmittedBlockReturnType - Return true if this declarator is missing a
/// return type because this is an omitted return type on a block literal.
static bool isOmittedBlockReturnType(const Declarator &D) {
if (D.getContext() != DeclaratorContext::BlockLiteralContext ||
D.getDeclSpec().hasTypeSpecifier())
return false;
if (D.getNumTypeObjects() == 0)
return true; // ^{ ... }
if (D.getNumTypeObjects() == 1 &&
D.getTypeObject(0).Kind == DeclaratorChunk::Function)
return true; // ^(int X, float Y) { ... }
return false;
}
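// Illustration (hypothetical user code, not part of this file): the two block
// forms the helper above classifies as having an omitted return type.
//
//   int (^a)(void) = ^{ return 0; };             // no declarator chunks
//   int (^b)(int) = ^(int x) { return x + 1; };  // a single Function chunk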
/// diagnoseBadTypeAttribute - Diagnoses a type attribute which
/// doesn't apply to the given type.
static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr,
QualType type) {
TypeDiagSelector WhichType;
bool useExpansionLoc = true;
switch (attr.getKind()) {
case ParsedAttr::AT_ObjCGC:
WhichType = TDS_Pointer;
break;
case ParsedAttr::AT_ObjCOwnership:
WhichType = TDS_ObjCObjOrBlock;
break;
default:
// Assume everything else was a function attribute.
WhichType = TDS_Function;
useExpansionLoc = false;
break;
}
SourceLocation loc = attr.getLoc();
StringRef name = attr.getName()->getName();
// The GC attributes are usually written with macros; special-case them.
IdentifierInfo *II = attr.isArgIdent(0) ? attr.getArgAsIdent(0)->Ident
: nullptr;
if (useExpansionLoc && loc.isMacroID() && II) {
if (II->isStr("strong")) {
if (S.findMacroSpelling(loc, "__strong")) name = "__strong";
} else if (II->isStr("weak")) {
if (S.findMacroSpelling(loc, "__weak")) name = "__weak";
}
}
S.Diag(loc, diag::warn_type_attribute_wrong_type) << name << WhichType
<< type;
}
// objc_gc applies to Objective-C pointers or, otherwise, to the
// smallest available pointer type (i.e. 'void*' in 'void**').
#define OBJC_POINTER_TYPE_ATTRS_CASELIST \
case ParsedAttr::AT_ObjCGC: \
case ParsedAttr::AT_ObjCOwnership
// Calling convention attributes.
#define CALLING_CONV_ATTRS_CASELIST \
case ParsedAttr::AT_CDecl: \
case ParsedAttr::AT_FastCall: \
case ParsedAttr::AT_StdCall: \
case ParsedAttr::AT_ThisCall: \
case ParsedAttr::AT_RegCall: \
case ParsedAttr::AT_Pascal: \
case ParsedAttr::AT_SwiftCall: \
case ParsedAttr::AT_VectorCall: \
case ParsedAttr::AT_AArch64VectorPcs: \
case ParsedAttr::AT_MSABI: \
case ParsedAttr::AT_SysVABI: \
case ParsedAttr::AT_Pcs: \
case ParsedAttr::AT_IntelOclBicc: \
case ParsedAttr::AT_PreserveMost: \
case ParsedAttr::AT_PreserveAll
// Function type attributes.
#define FUNCTION_TYPE_ATTRS_CASELIST \
case ParsedAttr::AT_NSReturnsRetained: \
case ParsedAttr::AT_NoReturn: \
case ParsedAttr::AT_Regparm: \
case ParsedAttr::AT_AnyX86NoCallerSavedRegisters: \
case ParsedAttr::AT_AnyX86NoCfCheck: \
CALLING_CONV_ATTRS_CASELIST
// Microsoft-specific type qualifiers.
#define MS_TYPE_ATTRS_CASELIST \
case ParsedAttr::AT_Ptr32: \
case ParsedAttr::AT_Ptr64: \
case ParsedAttr::AT_SPtr: \
case ParsedAttr::AT_UPtr
// Nullability qualifiers.
#define NULLABILITY_TYPE_ATTRS_CASELIST \
case ParsedAttr::AT_TypeNonNull: \
case ParsedAttr::AT_TypeNullable: \
case ParsedAttr::AT_TypeNullUnspecified
namespace {
/// An object which stores processing state for the entire
/// GetTypeForDeclarator process.
class TypeProcessingState {
Sema &sema;
/// The declarator being processed.
Declarator &declarator;
/// The index of the declarator chunk we're currently processing.
/// May be the total number of valid chunks, indicating the
/// DeclSpec.
unsigned chunkIndex;
/// Whether there are non-trivial modifications to the decl spec.
bool trivial;
/// Whether we saved the attributes in the decl spec.
bool hasSavedAttrs;
/// The original set of attributes on the DeclSpec.
SmallVector<ParsedAttr *, 2> savedAttrs;
/// A list of attributes to diagnose the uselessness of when the
/// processing is complete.
SmallVector<ParsedAttr *, 2> ignoredTypeAttrs;
/// Attributes corresponding to AttributedTypeLocs that we have not yet
/// populated.
// FIXME: The two-phase mechanism by which we construct Types and fill
// their TypeLocs makes it hard to correctly assign these. We keep the
// attributes in creation order as an attempt to make them line up
// properly.
using TypeAttrPair = std::pair<const AttributedType*, const Attr*>;
SmallVector<TypeAttrPair, 8> AttrsForTypes;
bool AttrsForTypesSorted = true;
/// MacroQualifiedTypes mapping to macro expansion locations that will be
/// stored in a MacroQualifiedTypeLoc.
llvm::DenseMap<const MacroQualifiedType *, SourceLocation> LocsForMacros;
/// Flag to indicate we parsed a noderef attribute. This is used for
/// validating that noderef was used on a pointer or array.
bool parsedNoDeref;
public:
TypeProcessingState(Sema &sema, Declarator &declarator)
: sema(sema), declarator(declarator),
chunkIndex(declarator.getNumTypeObjects()), trivial(true),
hasSavedAttrs(false), parsedNoDeref(false) {}
Sema &getSema() const {
return sema;
}
Declarator &getDeclarator() const {
return declarator;
}
bool isProcessingDeclSpec() const {
return chunkIndex == declarator.getNumTypeObjects();
}
unsigned getCurrentChunkIndex() const {
return chunkIndex;
}
void setCurrentChunkIndex(unsigned idx) {
assert(idx <= declarator.getNumTypeObjects());
chunkIndex = idx;
}
ParsedAttributesView &getCurrentAttributes() const {
if (isProcessingDeclSpec())
return getMutableDeclSpec().getAttributes();
return declarator.getTypeObject(chunkIndex).getAttrs();
}
/// Save the current set of attributes on the DeclSpec.
void saveDeclSpecAttrs() {
// Don't try to save them multiple times.
if (hasSavedAttrs) return;
DeclSpec &spec = getMutableDeclSpec();
for (ParsedAttr &AL : spec.getAttributes())
savedAttrs.push_back(&AL);
trivial &= savedAttrs.empty();
hasSavedAttrs = true;
}
/// Record that we had nowhere to put the given type attribute.
/// We will diagnose such attributes later.
void addIgnoredTypeAttr(ParsedAttr &attr) {
ignoredTypeAttrs.push_back(&attr);
}
/// Diagnose all the ignored type attributes, given that the
/// declarator worked out to the given type.
void diagnoseIgnoredTypeAttrs(QualType type) const {
for (auto *Attr : ignoredTypeAttrs)
diagnoseBadTypeAttribute(getSema(), *Attr, type);
}
/// Get an attributed type for the given attribute, and remember the Attr
/// object so that we can attach it to the AttributedTypeLoc.
QualType getAttributedType(Attr *A, QualType ModifiedType,
QualType EquivType) {
QualType T =
sema.Context.getAttributedType(A->getKind(), ModifiedType, EquivType);
AttrsForTypes.push_back({cast<AttributedType>(T.getTypePtr()), A});
AttrsForTypesSorted = false;
return T;
}
/// Completely replace the \c auto in \p TypeWithAuto by
/// \p Replacement. Also replace \p TypeWithAuto in \c TypeAttrPair if
/// necessary.
QualType ReplaceAutoType(QualType TypeWithAuto, QualType Replacement) {
QualType T = sema.ReplaceAutoType(TypeWithAuto, Replacement);
if (auto *AttrTy = TypeWithAuto->getAs<AttributedType>()) {
// Attributed type still should be an attributed type after replacement.
auto *NewAttrTy = cast<AttributedType>(T.getTypePtr());
for (TypeAttrPair &A : AttrsForTypes) {
if (A.first == AttrTy)
A.first = NewAttrTy;
}
AttrsForTypesSorted = false;
}
return T;
}
/// Extract and remove the Attr* for a given attributed type.
const Attr *takeAttrForAttributedType(const AttributedType *AT) {
if (!AttrsForTypesSorted) {
llvm::stable_sort(AttrsForTypes, llvm::less_first());
AttrsForTypesSorted = true;
}
// FIXME: This is quadratic if we have lots of reuses of the same
// attributed type.
for (auto It = std::partition_point(
AttrsForTypes.begin(), AttrsForTypes.end(),
[=](const TypeAttrPair &A) { return A.first < AT; });
It != AttrsForTypes.end() && It->first == AT; ++It) {
if (It->second) {
const Attr *Result = It->second;
It->second = nullptr;
return Result;
}
}
llvm_unreachable("no Attr* for AttributedType*");
}
SourceLocation
getExpansionLocForMacroQualifiedType(const MacroQualifiedType *MQT) const {
auto FoundLoc = LocsForMacros.find(MQT);
assert(FoundLoc != LocsForMacros.end() &&
"Unable to find macro expansion location for MacroQualifedType");
return FoundLoc->second;
}
void setExpansionLocForMacroQualifiedType(const MacroQualifiedType *MQT,
SourceLocation Loc) {
LocsForMacros[MQT] = Loc;
}
void setParsedNoDeref(bool parsed) { parsedNoDeref = parsed; }
bool didParseNoDeref() const { return parsedNoDeref; }
~TypeProcessingState() {
if (trivial) return;
restoreDeclSpecAttrs();
}
private:
DeclSpec &getMutableDeclSpec() const {
return const_cast<DeclSpec&>(declarator.getDeclSpec());
}
void restoreDeclSpecAttrs() {
assert(hasSavedAttrs);
getMutableDeclSpec().getAttributes().clearListOnly();
for (ParsedAttr *AL : savedAttrs)
getMutableDeclSpec().getAttributes().addAtEnd(AL);
}
};
} // end anonymous namespace
static void moveAttrFromListToList(ParsedAttr &attr,
ParsedAttributesView &fromList,
ParsedAttributesView &toList) {
fromList.remove(&attr);
toList.addAtEnd(&attr);
}
/// The location of a type attribute.
enum TypeAttrLocation {
/// The attribute is in the decl-specifier-seq.
TAL_DeclSpec,
/// The attribute is part of a DeclaratorChunk.
TAL_DeclChunk,
/// The attribute is immediately after the declaration's name.
TAL_DeclName
};
static void processTypeAttrs(TypeProcessingState &state, QualType &type,
TypeAttrLocation TAL, ParsedAttributesView &attrs);
static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
QualType &type);
static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &state,
ParsedAttr &attr, QualType &type);
static bool handleObjCGCTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
QualType &type);
static bool handleObjCOwnershipTypeAttr(TypeProcessingState &state,
ParsedAttr &attr, QualType &type);
static bool handleObjCPointerTypeAttr(TypeProcessingState &state,
ParsedAttr &attr, QualType &type) {
if (attr.getKind() == ParsedAttr::AT_ObjCGC)
return handleObjCGCTypeAttr(state, attr, type);
assert(attr.getKind() == ParsedAttr::AT_ObjCOwnership);
return handleObjCOwnershipTypeAttr(state, attr, type);
}
/// Given the index of a declarator chunk, check whether that chunk
/// directly specifies the return type of a function and, if so, find
/// an appropriate place for it.
///
/// \param i - a notional index which the search will start
/// immediately inside
///
/// \param onlyBlockPointers Whether we should only look into block
/// pointer types (vs. all pointer types).
static DeclaratorChunk *maybeMovePastReturnType(Declarator &declarator,
unsigned i,
bool onlyBlockPointers) {
assert(i <= declarator.getNumTypeObjects());
DeclaratorChunk *result = nullptr;
// First, look inwards past parens for a function declarator.
for (; i != 0; --i) {
DeclaratorChunk &fnChunk = declarator.getTypeObject(i-1);
switch (fnChunk.Kind) {
case DeclaratorChunk::Paren:
continue;
// If we find anything except a function, bail out.
case DeclaratorChunk::Pointer:
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::Array:
case DeclaratorChunk::Reference:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pipe:
return result;
// If we do find a function declarator, scan inwards from that,
// looking for a (block-)pointer declarator.
case DeclaratorChunk::Function:
for (--i; i != 0; --i) {
DeclaratorChunk &ptrChunk = declarator.getTypeObject(i-1);
switch (ptrChunk.Kind) {
case DeclaratorChunk::Paren:
case DeclaratorChunk::Array:
case DeclaratorChunk::Function:
case DeclaratorChunk::Reference:
case DeclaratorChunk::Pipe:
continue;
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pointer:
if (onlyBlockPointers)
continue;
LLVM_FALLTHROUGH;
case DeclaratorChunk::BlockPointer:
result = &ptrChunk;
goto continue_outer;
}
llvm_unreachable("bad declarator chunk kind");
}
// If we run out of declarators doing that, we're done.
return result;
}
llvm_unreachable("bad declarator chunk kind");
// Okay, reconsider from our new point.
continue_outer: ;
}
// Ran out of chunks, bail out.
return result;
}
/// Given that an objc_gc attribute was written somewhere on a
/// declaration *other* than on the declarator itself (for which, use
/// distributeObjCPointerTypeAttrFromDeclarator), and given that it
/// didn't apply in whatever position it was written in, try to move
/// it to a more appropriate position.
static void distributeObjCPointerTypeAttr(TypeProcessingState &state,
ParsedAttr &attr, QualType type) {
Declarator &declarator = state.getDeclarator();
// Move it to the outermost normal or block pointer declarator.
for (unsigned i = state.getCurrentChunkIndex(); i != 0; --i) {
DeclaratorChunk &chunk = declarator.getTypeObject(i-1);
switch (chunk.Kind) {
case DeclaratorChunk::Pointer:
case DeclaratorChunk::BlockPointer: {
// But don't move an ARC ownership attribute to the return type
// of a block.
DeclaratorChunk *destChunk = nullptr;
if (state.isProcessingDeclSpec() &&
attr.getKind() == ParsedAttr::AT_ObjCOwnership)
destChunk = maybeMovePastReturnType(declarator, i - 1,
/*onlyBlockPointers=*/true);
if (!destChunk) destChunk = &chunk;
moveAttrFromListToList(attr, state.getCurrentAttributes(),
destChunk->getAttrs());
return;
}
case DeclaratorChunk::Paren:
case DeclaratorChunk::Array:
continue;
// We may be starting at the return type of a block.
case DeclaratorChunk::Function:
if (state.isProcessingDeclSpec() &&
attr.getKind() == ParsedAttr::AT_ObjCOwnership) {
if (DeclaratorChunk *dest = maybeMovePastReturnType(
declarator, i,
/*onlyBlockPointers=*/true)) {
moveAttrFromListToList(attr, state.getCurrentAttributes(),
dest->getAttrs());
return;
}
}
goto error;
// Don't walk through these.
case DeclaratorChunk::Reference:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pipe:
goto error;
}
}
error:
diagnoseBadTypeAttribute(state.getSema(), attr, type);
}
/// Distribute an objc_gc type attribute that was written on the
/// declarator.
static void distributeObjCPointerTypeAttrFromDeclarator(
TypeProcessingState &state, ParsedAttr &attr, QualType &declSpecType) {
Declarator &declarator = state.getDeclarator();
// objc_gc goes on the innermost pointer to something that's not a
// pointer.
unsigned innermost = -1U;
bool considerDeclSpec = true;
for (unsigned i = 0, e = declarator.getNumTypeObjects(); i != e; ++i) {
DeclaratorChunk &chunk = declarator.getTypeObject(i);
switch (chunk.Kind) {
case DeclaratorChunk::Pointer:
case DeclaratorChunk::BlockPointer:
innermost = i;
continue;
case DeclaratorChunk::Reference:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Paren:
case DeclaratorChunk::Array:
case DeclaratorChunk::Pipe:
continue;
case DeclaratorChunk::Function:
considerDeclSpec = false;
goto done;
}
}
done:
// That might actually be the decl spec if we weren't blocked by
// anything in the declarator.
if (considerDeclSpec) {
if (handleObjCPointerTypeAttr(state, attr, declSpecType)) {
// Splice the attribute into the decl spec. Prevents the
// attribute from being applied multiple times and gives
// the source-location-filler something to work with.
state.saveDeclSpecAttrs();
declarator.getMutableDeclSpec().getAttributes().takeOneFrom(
declarator.getAttributes(), &attr);
return;
}
}
// Otherwise, if we found an appropriate chunk, splice the attribute
// into it.
if (innermost != -1U) {
moveAttrFromListToList(attr, declarator.getAttributes(),
declarator.getTypeObject(innermost).getAttrs());
return;
}
// Otherwise, diagnose when we're done building the type.
declarator.getAttributes().remove(&attr);
state.addIgnoredTypeAttr(attr);
}
/// A function type attribute was written somewhere in a declaration
/// *other* than on the declarator itself or in the decl spec. Given
/// that it didn't apply in whatever position it was written in, try
/// to move it to a more appropriate position.
static void distributeFunctionTypeAttr(TypeProcessingState &state,
ParsedAttr &attr, QualType type) {
Declarator &declarator = state.getDeclarator();
// Try to push the attribute from the return type of a function to
// the function itself.
for (unsigned i = state.getCurrentChunkIndex(); i != 0; --i) {
DeclaratorChunk &chunk = declarator.getTypeObject(i-1);
switch (chunk.Kind) {
case DeclaratorChunk::Function:
moveAttrFromListToList(attr, state.getCurrentAttributes(),
chunk.getAttrs());
return;
case DeclaratorChunk::Paren:
case DeclaratorChunk::Pointer:
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::Array:
case DeclaratorChunk::Reference:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pipe:
continue;
}
}
diagnoseBadTypeAttribute(state.getSema(), attr, type);
}
/// Try to distribute a function type attribute to the innermost
/// function chunk or type. Returns true if the attribute was
/// distributed, false if no location was found.
static bool distributeFunctionTypeAttrToInnermost(
TypeProcessingState &state, ParsedAttr &attr,
ParsedAttributesView &attrList, QualType &declSpecType) {
Declarator &declarator = state.getDeclarator();
// Put it on the innermost function chunk, if there is one.
for (unsigned i = 0, e = declarator.getNumTypeObjects(); i != e; ++i) {
DeclaratorChunk &chunk = declarator.getTypeObject(i);
if (chunk.Kind != DeclaratorChunk::Function) continue;
moveAttrFromListToList(attr, attrList, chunk.getAttrs());
return true;
}
return handleFunctionTypeAttr(state, attr, declSpecType);
}
/// A function type attribute was written in the decl spec. Try to
/// apply it somewhere.
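/// For instance, a GNU calling-convention attribute such as
/// '__attribute__((stdcall))' spelled among the decl specifiers is typically
/// pushed onto the innermost function declarator chunk below.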
static void distributeFunctionTypeAttrFromDeclSpec(TypeProcessingState &state,
ParsedAttr &attr,
QualType &declSpecType) {
state.saveDeclSpecAttrs();
// C++11 attributes before the decl specifiers actually appertain to
// the declarators. Move them straight there. We don't support the
// 'put them wherever you like' semantics we allow for GNU attributes.
if (attr.isCXX11Attribute()) {
moveAttrFromListToList(attr, state.getCurrentAttributes(),
state.getDeclarator().getAttributes());
return;
}
// Try to distribute to the innermost.
if (distributeFunctionTypeAttrToInnermost(
state, attr, state.getCurrentAttributes(), declSpecType))
return;
// If that failed, diagnose the bad attribute when the declarator is
// fully built.
state.addIgnoredTypeAttr(attr);
}
/// A function type attribute was written on the declarator. Try to
/// apply it somewhere.
static void distributeFunctionTypeAttrFromDeclarator(TypeProcessingState &state,
ParsedAttr &attr,
QualType &declSpecType) {
Declarator &declarator = state.getDeclarator();
// Try to distribute to the innermost.
if (distributeFunctionTypeAttrToInnermost(
state, attr, declarator.getAttributes(), declSpecType))
return;
// If that failed, diagnose the bad attribute when the declarator is
// fully built.
declarator.getAttributes().remove(&attr);
state.addIgnoredTypeAttr(attr);
}
/// Given that there are attributes written on the declarator
/// itself, try to distribute any type attributes to the appropriate
/// declarator chunk.
///
/// These are attributes like the following:
/// int f ATTR;
/// int (f ATTR)();
/// but not necessarily this:
/// int f() ATTR;
static void distributeTypeAttrsFromDeclarator(TypeProcessingState &state,
QualType &declSpecType) {
// Collect all the type attributes from the declarator itself.
assert(!state.getDeclarator().getAttributes().empty() &&
"declarator has no attrs!");
// The called functions in this loop actually remove things from the current
// list, so iterating over the existing list isn't possible. Instead, make a
// non-owning copy and iterate over that.
ParsedAttributesView AttrsCopy{state.getDeclarator().getAttributes()};
for (ParsedAttr &attr : AttrsCopy) {
// Do not distribute C++11 attributes. They have strict rules for what
// they appertain to.
if (attr.isCXX11Attribute())
continue;
switch (attr.getKind()) {
OBJC_POINTER_TYPE_ATTRS_CASELIST:
distributeObjCPointerTypeAttrFromDeclarator(state, attr, declSpecType);
break;
FUNCTION_TYPE_ATTRS_CASELIST:
distributeFunctionTypeAttrFromDeclarator(state, attr, declSpecType);
break;
MS_TYPE_ATTRS_CASELIST:
// Microsoft type attributes cannot go after the declarator-id.
continue;
NULLABILITY_TYPE_ATTRS_CASELIST:
// Nullability specifiers cannot go after the declarator-id.
// Objective-C __kindof does not get distributed.
case ParsedAttr::AT_ObjCKindOf:
continue;
default:
break;
}
}
}
/// Add a synthetic '()' to a block-literal declarator if it is
/// required, given the return type.
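/// For example, a block literal written as '^ { return 0; }' has no function
/// chunk in its declarator, so a '()' prototype chunk is fabricated below.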
static void maybeSynthesizeBlockSignature(TypeProcessingState &state,
QualType declSpecType) {
Declarator &declarator = state.getDeclarator();
// First, check whether the declarator would produce a function,
// i.e. whether the innermost semantic chunk is a function.
if (declarator.isFunctionDeclarator()) {
// If so, make that declarator a prototyped declarator.
declarator.getFunctionTypeInfo().hasPrototype = true;
return;
}
// If there are any type objects, the type as written won't name a
// function, regardless of the decl spec type. This is because a
// block signature declarator is always an abstract-declarator, and
// abstract-declarators can't just be parentheses chunks. Therefore
// we need to build a function chunk unless there are no type
// objects and the decl spec type is a function.
if (!declarator.getNumTypeObjects() && declSpecType->isFunctionType())
return;
// Note that there *are* cases with invalid declarators where
// declarators consist solely of parentheses. In general, these
// occur only in failed efforts to make function declarators, so
// faking up the function chunk is still the right thing to do.
// Otherwise, we need to fake up a function declarator.
SourceLocation loc = declarator.getBeginLoc();
// ...and *prepend* it to the declarator.
SourceLocation NoLoc;
declarator.AddInnermostTypeInfo(DeclaratorChunk::getFunction(
/*HasProto=*/true,
/*IsAmbiguous=*/false,
/*LParenLoc=*/NoLoc,
/*ArgInfo=*/nullptr,
/*NumParams=*/0,
/*EllipsisLoc=*/NoLoc,
/*RParenLoc=*/NoLoc,
/*RefQualifierIsLvalueRef=*/true,
/*RefQualifierLoc=*/NoLoc,
/*MutableLoc=*/NoLoc, EST_None,
/*ESpecRange=*/SourceRange(),
/*Exceptions=*/nullptr,
/*ExceptionRanges=*/nullptr,
/*NumExceptions=*/0,
/*NoexceptExpr=*/nullptr,
/*ExceptionSpecTokens=*/nullptr,
/*DeclsInPrototype=*/None, loc, loc, declarator));
// For consistency, make sure the state still has us as processing
// the decl spec.
assert(state.getCurrentChunkIndex() == declarator.getNumTypeObjects() - 1);
state.setCurrentChunkIndex(declarator.getNumTypeObjects());
}
static void diagnoseAndRemoveTypeQualifiers(Sema &S, const DeclSpec &DS,
unsigned &TypeQuals,
QualType TypeSoFar,
unsigned RemoveTQs,
unsigned DiagID) {
// If this occurs outside a template instantiation, warn the user about
// it; they probably didn't mean to specify a redundant qualifier.
typedef std::pair<DeclSpec::TQ, SourceLocation> QualLoc;
for (QualLoc Qual : {QualLoc(DeclSpec::TQ_const, DS.getConstSpecLoc()),
QualLoc(DeclSpec::TQ_restrict, DS.getRestrictSpecLoc()),
QualLoc(DeclSpec::TQ_volatile, DS.getVolatileSpecLoc()),
QualLoc(DeclSpec::TQ_atomic, DS.getAtomicSpecLoc())}) {
if (!(RemoveTQs & Qual.first))
continue;
if (!S.inTemplateInstantiation()) {
if (TypeQuals & Qual.first)
S.Diag(Qual.second, DiagID)
<< DeclSpec::getSpecifierName(Qual.first) << TypeSoFar
<< FixItHint::CreateRemoval(Qual.second);
}
TypeQuals &= ~Qual.first;
}
}
/// Return true if this declarator has an omitted block return type. Also
/// check type attributes and type qualifiers when returning true.
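/// For example, a block literal that spells type qualifiers such as 'const'
/// or type attributes in its decl-spec while omitting the return type is
/// diagnosed here, and the offending qualifiers and attributes are dropped.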
static bool checkOmittedBlockReturnType(Sema &S, Declarator &declarator,
QualType Result) {
if (!isOmittedBlockReturnType(declarator))
return false;
// Warn if we see type attributes for omitted return type on a block literal.
SmallVector<ParsedAttr *, 2> ToBeRemoved;
for (ParsedAttr &AL : declarator.getMutableDeclSpec().getAttributes()) {
if (AL.isInvalid() || !AL.isTypeAttr())
continue;
S.Diag(AL.getLoc(),
diag::warn_block_literal_attributes_on_omitted_return_type)
<< AL.getName();
ToBeRemoved.push_back(&AL);
}
// Remove bad attributes from the list.
for (ParsedAttr *AL : ToBeRemoved)
declarator.getMutableDeclSpec().getAttributes().remove(AL);
// Warn if we see type qualifiers for omitted return type on a block literal.
const DeclSpec &DS = declarator.getDeclSpec();
unsigned TypeQuals = DS.getTypeQualifiers();
diagnoseAndRemoveTypeQualifiers(S, DS, TypeQuals, Result, (unsigned)-1,
diag::warn_block_literal_qualifiers_on_omitted_return_type);
declarator.getMutableDeclSpec().ClearTypeQualifiers();
return true;
}
/// Apply Objective-C type arguments to the given type.
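/// For example, for 'NSArray<NSString *> *a;' the type argument 'NSString *'
/// is applied to the parameterized class NSArray, producing the specialized
/// object type that the pointer refers to.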
static QualType applyObjCTypeArgs(Sema &S, SourceLocation loc, QualType type,
ArrayRef<TypeSourceInfo *> typeArgs,
SourceRange typeArgsRange,
bool failOnError = false) {
// We can only apply type arguments to an Objective-C class type.
const auto *objcObjectType = type->getAs<ObjCObjectType>();
if (!objcObjectType || !objcObjectType->getInterface()) {
S.Diag(loc, diag::err_objc_type_args_non_class)
<< type
<< typeArgsRange;
if (failOnError)
return QualType();
return type;
}
// The class type must be parameterized.
ObjCInterfaceDecl *objcClass = objcObjectType->getInterface();
ObjCTypeParamList *typeParams = objcClass->getTypeParamList();
if (!typeParams) {
S.Diag(loc, diag::err_objc_type_args_non_parameterized_class)
<< objcClass->getDeclName()
<< FixItHint::CreateRemoval(typeArgsRange);
if (failOnError)
return QualType();
return type;
}
// The type must not already be specialized.
if (objcObjectType->isSpecialized()) {
S.Diag(loc, diag::err_objc_type_args_specialized_class)
<< type
<< FixItHint::CreateRemoval(typeArgsRange);
if (failOnError)
return QualType();
return type;
}
// Check the type arguments.
SmallVector<QualType, 4> finalTypeArgs;
unsigned numTypeParams = typeParams->size();
bool anyPackExpansions = false;
for (unsigned i = 0, n = typeArgs.size(); i != n; ++i) {
TypeSourceInfo *typeArgInfo = typeArgs[i];
QualType typeArg = typeArgInfo->getType();
// Type arguments cannot have explicit qualifiers or nullability.
// We ignore indirect sources of these, e.g. behind typedefs or
// template arguments.
if (TypeLoc qual = typeArgInfo->getTypeLoc().findExplicitQualifierLoc()) {
bool diagnosed = false;
SourceRange rangeToRemove;
if (auto attr = qual.getAs<AttributedTypeLoc>()) {
rangeToRemove = attr.getLocalSourceRange();
if (attr.getTypePtr()->getImmediateNullability()) {
typeArg = attr.getTypePtr()->getModifiedType();
S.Diag(attr.getBeginLoc(),
diag::err_objc_type_arg_explicit_nullability)
<< typeArg << FixItHint::CreateRemoval(rangeToRemove);
diagnosed = true;
}
}
if (!diagnosed) {
S.Diag(qual.getBeginLoc(), diag::err_objc_type_arg_qualified)
<< typeArg << typeArg.getQualifiers().getAsString()
<< FixItHint::CreateRemoval(rangeToRemove);
}
}
// Remove qualifiers even if they're non-local.
typeArg = typeArg.getUnqualifiedType();
finalTypeArgs.push_back(typeArg);
if (typeArg->getAs<PackExpansionType>())
anyPackExpansions = true;
// Find the corresponding type parameter, if there is one.
ObjCTypeParamDecl *typeParam = nullptr;
if (!anyPackExpansions) {
if (i < numTypeParams) {
typeParam = typeParams->begin()[i];
} else {
// Too many arguments.
S.Diag(loc, diag::err_objc_type_args_wrong_arity)
<< false
<< objcClass->getDeclName()
<< (unsigned)typeArgs.size()
<< numTypeParams;
S.Diag(objcClass->getLocation(), diag::note_previous_decl)
<< objcClass;
if (failOnError)
return QualType();
return type;
}
}
// Objective-C object pointer types must be substitutable for the bounds.
if (const auto *typeArgObjC = typeArg->getAs<ObjCObjectPointerType>()) {
// If we don't have a type parameter to match against, assume
// everything is fine. There was a prior pack expansion that
// means we won't be able to match anything.
if (!typeParam) {
assert(anyPackExpansions && "Too many arguments?");
continue;
}
// Retrieve the bound.
QualType bound = typeParam->getUnderlyingType();
const auto *boundObjC = bound->getAs<ObjCObjectPointerType>();
// Determine whether the type argument is substitutable for the bound.
if (typeArgObjC->isObjCIdType()) {
// When the type argument is 'id', the only acceptable type
// parameter bound is 'id'.
if (boundObjC->isObjCIdType())
continue;
} else if (S.Context.canAssignObjCInterfaces(boundObjC, typeArgObjC)) {
// Otherwise, we follow the assignability rules.
continue;
}
// Diagnose the mismatch.
S.Diag(typeArgInfo->getTypeLoc().getBeginLoc(),
diag::err_objc_type_arg_does_not_match_bound)
<< typeArg << bound << typeParam->getDeclName();
S.Diag(typeParam->getLocation(), diag::note_objc_type_param_here)
<< typeParam->getDeclName();
if (failOnError)
return QualType();
return type;
}
// Block pointer types are permitted for unqualified 'id' bounds.
if (typeArg->isBlockPointerType()) {
// If we don't have a type parameter to match against, assume
// everything is fine. There was a prior pack expansion that
// means we won't be able to match anything.
if (!typeParam) {
assert(anyPackExpansions && "Too many arguments?");
continue;
}
// Retrieve the bound.
QualType bound = typeParam->getUnderlyingType();
if (bound->isBlockCompatibleObjCPointerType(S.Context))
continue;
// Diagnose the mismatch.
S.Diag(typeArgInfo->getTypeLoc().getBeginLoc(),
diag::err_objc_type_arg_does_not_match_bound)
<< typeArg << bound << typeParam->getDeclName();
S.Diag(typeParam->getLocation(), diag::note_objc_type_param_here)
<< typeParam->getDeclName();
if (failOnError)
return QualType();
return type;
}
// Dependent types will be checked at instantiation time.
if (typeArg->isDependentType()) {
continue;
}
// Diagnose non-id-compatible type arguments.
S.Diag(typeArgInfo->getTypeLoc().getBeginLoc(),
diag::err_objc_type_arg_not_id_compatible)
<< typeArg << typeArgInfo->getTypeLoc().getSourceRange();
if (failOnError)
return QualType();
return type;
}
// Make sure we didn't have the wrong number of arguments.
if (!anyPackExpansions && finalTypeArgs.size() != numTypeParams) {
S.Diag(loc, diag::err_objc_type_args_wrong_arity)
<< (typeArgs.size() < typeParams->size())
<< objcClass->getDeclName()
<< (unsigned)finalTypeArgs.size()
<< (unsigned)numTypeParams;
S.Diag(objcClass->getLocation(), diag::note_previous_decl)
<< objcClass;
if (failOnError)
return QualType();
return type;
}
// Success. Form the specialized type.
return S.Context.getObjCObjectType(type, finalTypeArgs, { }, false);
}
QualType Sema::BuildObjCTypeParamType(const ObjCTypeParamDecl *Decl,
SourceLocation ProtocolLAngleLoc,
ArrayRef<ObjCProtocolDecl *> Protocols,
ArrayRef<SourceLocation> ProtocolLocs,
SourceLocation ProtocolRAngleLoc,
bool FailOnError) {
QualType Result = QualType(Decl->getTypeForDecl(), 0);
if (!Protocols.empty()) {
bool HasError;
Result = Context.applyObjCProtocolQualifiers(Result, Protocols,
HasError);
if (HasError) {
Diag(SourceLocation(), diag::err_invalid_protocol_qualifiers)
<< SourceRange(ProtocolLAngleLoc, ProtocolRAngleLoc);
if (FailOnError) Result = QualType();
}
if (FailOnError && Result.isNull())
return QualType();
}
return Result;
}
QualType Sema::BuildObjCObjectType(QualType BaseType,
SourceLocation Loc,
SourceLocation TypeArgsLAngleLoc,
ArrayRef<TypeSourceInfo *> TypeArgs,
SourceLocation TypeArgsRAngleLoc,
SourceLocation ProtocolLAngleLoc,
ArrayRef<ObjCProtocolDecl *> Protocols,
ArrayRef<SourceLocation> ProtocolLocs,
SourceLocation ProtocolRAngleLoc,
bool FailOnError) {
QualType Result = BaseType;
if (!TypeArgs.empty()) {
Result = applyObjCTypeArgs(*this, Loc, Result, TypeArgs,
SourceRange(TypeArgsLAngleLoc,
TypeArgsRAngleLoc),
FailOnError);
if (FailOnError && Result.isNull())
return QualType();
}
if (!Protocols.empty()) {
bool HasError;
Result = Context.applyObjCProtocolQualifiers(Result, Protocols,
HasError);
if (HasError) {
Diag(Loc, diag::err_invalid_protocol_qualifiers)
<< SourceRange(ProtocolLAngleLoc, ProtocolRAngleLoc);
if (FailOnError) Result = QualType();
}
if (FailOnError && Result.isNull())
return QualType();
}
return Result;
}
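// Handle a bare protocol-qualifier list written in source, e.g.
// '<NSCopying, NSCoding>', which forms the type 'id<NSCopying, NSCoding>'
// (see "Form id<protocol-list>" below).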
TypeResult Sema::actOnObjCProtocolQualifierType(
SourceLocation lAngleLoc,
ArrayRef<Decl *> protocols,
ArrayRef<SourceLocation> protocolLocs,
SourceLocation rAngleLoc) {
// Form id<protocol-list>.
QualType Result = Context.getObjCObjectType(
Context.ObjCBuiltinIdTy, { },
llvm::makeArrayRef(
(ObjCProtocolDecl * const *)protocols.data(),
protocols.size()),
false);
Result = Context.getObjCObjectPointerType(Result);
TypeSourceInfo *ResultTInfo = Context.CreateTypeSourceInfo(Result);
TypeLoc ResultTL = ResultTInfo->getTypeLoc();
auto ObjCObjectPointerTL = ResultTL.castAs<ObjCObjectPointerTypeLoc>();
ObjCObjectPointerTL.setStarLoc(SourceLocation()); // implicit
auto ObjCObjectTL = ObjCObjectPointerTL.getPointeeLoc()
.castAs<ObjCObjectTypeLoc>();
ObjCObjectTL.setHasBaseTypeAsWritten(false);
ObjCObjectTL.getBaseLoc().initialize(Context, SourceLocation());
// No type arguments.
ObjCObjectTL.setTypeArgsLAngleLoc(SourceLocation());
ObjCObjectTL.setTypeArgsRAngleLoc(SourceLocation());
// Fill in protocol qualifiers.
ObjCObjectTL.setProtocolLAngleLoc(lAngleLoc);
ObjCObjectTL.setProtocolRAngleLoc(rAngleLoc);
for (unsigned i = 0, n = protocols.size(); i != n; ++i)
ObjCObjectTL.setProtocolLoc(i, protocolLocs[i]);
// We're done. Return the completed type to the parser.
return CreateParsedType(Result, ResultTInfo);
}
TypeResult Sema::actOnObjCTypeArgsAndProtocolQualifiers(
Scope *S,
SourceLocation Loc,
ParsedType BaseType,
SourceLocation TypeArgsLAngleLoc,
ArrayRef<ParsedType> TypeArgs,
SourceLocation TypeArgsRAngleLoc,
SourceLocation ProtocolLAngleLoc,
ArrayRef<Decl *> Protocols,
ArrayRef<SourceLocation> ProtocolLocs,
SourceLocation ProtocolRAngleLoc) {
TypeSourceInfo *BaseTypeInfo = nullptr;
QualType T = GetTypeFromParser(BaseType, &BaseTypeInfo);
if (T.isNull())
return true;
// Handle missing type-source info.
if (!BaseTypeInfo)
BaseTypeInfo = Context.getTrivialTypeSourceInfo(T, Loc);
// Extract type arguments.
SmallVector<TypeSourceInfo *, 4> ActualTypeArgInfos;
for (unsigned i = 0, n = TypeArgs.size(); i != n; ++i) {
TypeSourceInfo *TypeArgInfo = nullptr;
QualType TypeArg = GetTypeFromParser(TypeArgs[i], &TypeArgInfo);
if (TypeArg.isNull()) {
ActualTypeArgInfos.clear();
break;
}
assert(TypeArgInfo && "No type source info?");
ActualTypeArgInfos.push_back(TypeArgInfo);
}
// Build the object type.
QualType Result = BuildObjCObjectType(
T, BaseTypeInfo->getTypeLoc().getSourceRange().getBegin(),
TypeArgsLAngleLoc, ActualTypeArgInfos, TypeArgsRAngleLoc,
ProtocolLAngleLoc,
llvm::makeArrayRef((ObjCProtocolDecl * const *)Protocols.data(),
Protocols.size()),
ProtocolLocs, ProtocolRAngleLoc,
/*FailOnError=*/false);
if (Result == T)
return BaseType;
// Create source information for this type.
TypeSourceInfo *ResultTInfo = Context.CreateTypeSourceInfo(Result);
TypeLoc ResultTL = ResultTInfo->getTypeLoc();
// For id<Proto1, Proto2> or Class<Proto1, Proto2>, we'll have an
// object pointer type. Fill in source information for it.
if (auto ObjCObjectPointerTL = ResultTL.getAs<ObjCObjectPointerTypeLoc>()) {
// The '*' is implicit.
ObjCObjectPointerTL.setStarLoc(SourceLocation());
ResultTL = ObjCObjectPointerTL.getPointeeLoc();
}
if (auto OTPTL = ResultTL.getAs<ObjCTypeParamTypeLoc>()) {
// Protocol qualifier information.
if (OTPTL.getNumProtocols() > 0) {
assert(OTPTL.getNumProtocols() == Protocols.size());
OTPTL.setProtocolLAngleLoc(ProtocolLAngleLoc);
OTPTL.setProtocolRAngleLoc(ProtocolRAngleLoc);
for (unsigned i = 0, n = Protocols.size(); i != n; ++i)
OTPTL.setProtocolLoc(i, ProtocolLocs[i]);
}
// We're done. Return the completed type to the parser.
return CreateParsedType(Result, ResultTInfo);
}
auto ObjCObjectTL = ResultTL.castAs<ObjCObjectTypeLoc>();
// Type argument information.
if (ObjCObjectTL.getNumTypeArgs() > 0) {
assert(ObjCObjectTL.getNumTypeArgs() == ActualTypeArgInfos.size());
ObjCObjectTL.setTypeArgsLAngleLoc(TypeArgsLAngleLoc);
ObjCObjectTL.setTypeArgsRAngleLoc(TypeArgsRAngleLoc);
for (unsigned i = 0, n = ActualTypeArgInfos.size(); i != n; ++i)
ObjCObjectTL.setTypeArgTInfo(i, ActualTypeArgInfos[i]);
} else {
ObjCObjectTL.setTypeArgsLAngleLoc(SourceLocation());
ObjCObjectTL.setTypeArgsRAngleLoc(SourceLocation());
}
// Protocol qualifier information.
if (ObjCObjectTL.getNumProtocols() > 0) {
assert(ObjCObjectTL.getNumProtocols() == Protocols.size());
ObjCObjectTL.setProtocolLAngleLoc(ProtocolLAngleLoc);
ObjCObjectTL.setProtocolRAngleLoc(ProtocolRAngleLoc);
for (unsigned i = 0, n = Protocols.size(); i != n; ++i)
ObjCObjectTL.setProtocolLoc(i, ProtocolLocs[i]);
} else {
ObjCObjectTL.setProtocolLAngleLoc(SourceLocation());
ObjCObjectTL.setProtocolRAngleLoc(SourceLocation());
}
// Base type.
ObjCObjectTL.setHasBaseTypeAsWritten(true);
if (ObjCObjectTL.getType() == T)
ObjCObjectTL.getBaseLoc().initializeFullCopy(BaseTypeInfo->getTypeLoc());
else
ObjCObjectTL.getBaseLoc().initialize(Context, Loc);
// We're done. Return the completed type to the parser.
return CreateParsedType(Result, ResultTInfo);
}
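// Return the OpenCL image access qualifier (__read_only, __write_only or
// __read_write) spelled on an image declaration; with no explicit qualifier,
// access defaults to read_only, matching OpenCL's default.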
static OpenCLAccessAttr::Spelling
getImageAccess(const ParsedAttributesView &Attrs) {
for (const ParsedAttr &AL : Attrs)
if (AL.getKind() == ParsedAttr::AT_OpenCLAccess)
return static_cast<OpenCLAccessAttr::Spelling>(AL.getSemanticSpelling());
return OpenCLAccessAttr::Keyword_read_only;
}
/// Convert the specified declspec to the appropriate type
/// object.
/// \param state Specifies the declarator containing the declaration specifier
/// to be converted, along with other associated processing state.
/// \returns The type described by the declaration specifiers. This function
/// never returns null.
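/// For example, a decl-spec of 'unsigned long long' maps to
/// Context.UnsignedLongLongTy and 'long double' maps to Context.LongDoubleTy
/// in the switch below.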
static QualType ConvertDeclSpecToType(TypeProcessingState &state) {
// FIXME: Should move the logic from DeclSpec::Finish to here for validity
// checking.
Sema &S = state.getSema();
Declarator &declarator = state.getDeclarator();
DeclSpec &DS = declarator.getMutableDeclSpec();
SourceLocation DeclLoc = declarator.getIdentifierLoc();
if (DeclLoc.isInvalid())
DeclLoc = DS.getBeginLoc();
ASTContext &Context = S.Context;
QualType Result;
switch (DS.getTypeSpecType()) {
case DeclSpec::TST_void:
Result = Context.VoidTy;
break;
case DeclSpec::TST_char:
if (DS.getTypeSpecSign() == DeclSpec::TSS_unspecified)
Result = Context.CharTy;
else if (DS.getTypeSpecSign() == DeclSpec::TSS_signed)
Result = Context.SignedCharTy;
else {
assert(DS.getTypeSpecSign() == DeclSpec::TSS_unsigned &&
"Unknown TSS value");
Result = Context.UnsignedCharTy;
}
break;
case DeclSpec::TST_wchar:
if (DS.getTypeSpecSign() == DeclSpec::TSS_unspecified)
Result = Context.WCharTy;
else if (DS.getTypeSpecSign() == DeclSpec::TSS_signed) {
S.Diag(DS.getTypeSpecSignLoc(), diag::ext_invalid_sign_spec)
<< DS.getSpecifierName(DS.getTypeSpecType(),
Context.getPrintingPolicy());
Result = Context.getSignedWCharType();
} else {
assert(DS.getTypeSpecSign() == DeclSpec::TSS_unsigned &&
"Unknown TSS value");
S.Diag(DS.getTypeSpecSignLoc(), diag::ext_invalid_sign_spec)
<< DS.getSpecifierName(DS.getTypeSpecType(),
Context.getPrintingPolicy());
Result = Context.getUnsignedWCharType();
}
break;
case DeclSpec::TST_char8:
assert(DS.getTypeSpecSign() == DeclSpec::TSS_unspecified &&
"Unknown TSS value");
Result = Context.Char8Ty;
break;
case DeclSpec::TST_char16:
assert(DS.getTypeSpecSign() == DeclSpec::TSS_unspecified &&
"Unknown TSS value");
Result = Context.Char16Ty;
break;
case DeclSpec::TST_char32:
assert(DS.getTypeSpecSign() == DeclSpec::TSS_unspecified &&
"Unknown TSS value");
Result = Context.Char32Ty;
break;
case DeclSpec::TST_unspecified:
// If this is a missing declspec in a block literal return context, then it
// is inferred from the return statements inside the block.
// The declspec is always missing in a lambda expr context; it is either
// specified with a trailing return type or inferred.
if (S.getLangOpts().CPlusPlus14 &&
declarator.getContext() == DeclaratorContext::LambdaExprContext) {
// In C++1y, a lambda's implicit return type is 'auto'.
Result = Context.getAutoDeductType();
break;
} else if (declarator.getContext() ==
DeclaratorContext::LambdaExprContext ||
checkOmittedBlockReturnType(S, declarator,
Context.DependentTy)) {
Result = Context.DependentTy;
break;
}
// Unspecified typespec defaults to int in C90. However, the C90 grammar
// [C90 6.5] only allows a decl-spec if there was *some* type-specifier,
// type-qualifier, or storage-class-specifier. If not, emit an extwarn.
// Note that the one exception to this is function definitions, which are
// allowed to be completely missing a declspec. The parser already handles
// that case by pretending to have seen an 'int'.
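// For instance, in C89 'static x;' quietly gets implicit int (its decl-spec
// is not empty), while a declaration whose decl-spec is completely empty,
// such as a bare 'f();' at file scope, draws the extension warning below.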
if (S.getLangOpts().ImplicitInt) {
// In C89 mode, we only warn if there is a completely missing declspec
// when one is not allowed.
if (DS.isEmpty()) {
S.Diag(DeclLoc, diag::ext_missing_declspec)
<< DS.getSourceRange()
<< FixItHint::CreateInsertion(DS.getBeginLoc(), "int");
}
} else if (!DS.hasTypeSpecifier()) {
// C99 and C++ require a type specifier. For example, C99 6.7.2p2 says:
// "At least one type specifier shall be given in the declaration
// specifiers in each declaration, and in the specifier-qualifier list in
// each struct declaration and type name."
if (S.getLangOpts().CPlusPlus && !DS.isTypeSpecPipe()) {
S.Diag(DeclLoc, diag::err_missing_type_specifier)
<< DS.getSourceRange();
// When this occurs in C++ code, something is usually very broken with the
// value being declared; poison it as invalid so we don't get chains of
// errors.
declarator.setInvalidType(true);
} else if ((S.getLangOpts().OpenCLVersion >= 200 ||
S.getLangOpts().OpenCLCPlusPlus) &&
DS.isTypeSpecPipe()) {
S.Diag(DeclLoc, diag::err_missing_actual_pipe_type)
<< DS.getSourceRange();
declarator.setInvalidType(true);
} else {
S.Diag(DeclLoc, diag::ext_missing_type_specifier)
<< DS.getSourceRange();
}
}
LLVM_FALLTHROUGH;
case DeclSpec::TST_int: {
if (DS.getTypeSpecSign() != DeclSpec::TSS_unsigned) {
switch (DS.getTypeSpecWidth()) {
case DeclSpec::TSW_unspecified: Result = Context.IntTy; break;
case DeclSpec::TSW_short: Result = Context.ShortTy; break;
case DeclSpec::TSW_long: Result = Context.LongTy; break;
case DeclSpec::TSW_longlong:
Result = Context.LongLongTy;
// 'long long' is a C99 or C++11 feature.
if (!S.getLangOpts().C99) {
if (S.getLangOpts().CPlusPlus)
S.Diag(DS.getTypeSpecWidthLoc(),
S.getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_longlong : diag::ext_cxx11_longlong);
else
S.Diag(DS.getTypeSpecWidthLoc(), diag::ext_c99_longlong);
}
break;
}
} else {
switch (DS.getTypeSpecWidth()) {
case DeclSpec::TSW_unspecified: Result = Context.UnsignedIntTy; break;
case DeclSpec::TSW_short: Result = Context.UnsignedShortTy; break;
case DeclSpec::TSW_long: Result = Context.UnsignedLongTy; break;
case DeclSpec::TSW_longlong:
Result = Context.UnsignedLongLongTy;
// 'long long' is a C99 or C++11 feature.
if (!S.getLangOpts().C99) {
if (S.getLangOpts().CPlusPlus)
S.Diag(DS.getTypeSpecWidthLoc(),
S.getLangOpts().CPlusPlus11 ?
diag::warn_cxx98_compat_longlong : diag::ext_cxx11_longlong);
else
S.Diag(DS.getTypeSpecWidthLoc(), diag::ext_c99_longlong);
}
break;
}
}
break;
}
case DeclSpec::TST_accum: {
switch (DS.getTypeSpecWidth()) {
case DeclSpec::TSW_short:
Result = Context.ShortAccumTy;
break;
case DeclSpec::TSW_unspecified:
Result = Context.AccumTy;
break;
case DeclSpec::TSW_long:
Result = Context.LongAccumTy;
break;
case DeclSpec::TSW_longlong:
llvm_unreachable("Unable to specify long long as _Accum width");
}
if (DS.getTypeSpecSign() == DeclSpec::TSS_unsigned)
Result = Context.getCorrespondingUnsignedType(Result);
if (DS.isTypeSpecSat())
Result = Context.getCorrespondingSaturatedType(Result);
break;
}
case DeclSpec::TST_fract: {
switch (DS.getTypeSpecWidth()) {
case DeclSpec::TSW_short:
Result = Context.ShortFractTy;
break;
case DeclSpec::TSW_unspecified:
Result = Context.FractTy;
break;
case DeclSpec::TSW_long:
Result = Context.LongFractTy;
break;
case DeclSpec::TSW_longlong:
llvm_unreachable("Unable to specify long long as _Fract width");
}
if (DS.getTypeSpecSign() == DeclSpec::TSS_unsigned)
Result = Context.getCorrespondingUnsignedType(Result);
if (DS.isTypeSpecSat())
Result = Context.getCorrespondingSaturatedType(Result);
break;
}
case DeclSpec::TST_int128:
if (!S.Context.getTargetInfo().hasInt128Type() &&
!(S.getLangOpts().OpenMP && S.getLangOpts().OpenMPIsDevice))
S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_unsupported)
<< "__int128";
if (DS.getTypeSpecSign() == DeclSpec::TSS_unsigned)
Result = Context.UnsignedInt128Ty;
else
Result = Context.Int128Ty;
break;
case DeclSpec::TST_float16:
// CUDA host and device may have different _Float16 support, so do not
// diagnose _Float16 usage here to avoid a false alarm.
// TODO: more precise diagnostics for CUDA.
if (!S.Context.getTargetInfo().hasFloat16Type() && !S.getLangOpts().CUDA &&
!(S.getLangOpts().OpenMP && S.getLangOpts().OpenMPIsDevice))
S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_unsupported)
<< "_Float16";
Result = Context.Float16Ty;
break;
case DeclSpec::TST_half: Result = Context.HalfTy; break;
case DeclSpec::TST_float: Result = Context.FloatTy; break;
case DeclSpec::TST_double:
if (DS.getTypeSpecWidth() == DeclSpec::TSW_long)
Result = Context.LongDoubleTy;
else
Result = Context.DoubleTy;
break;
case DeclSpec::TST_float128:
if (!S.Context.getTargetInfo().hasFloat128Type() &&
!(S.getLangOpts().OpenMP && S.getLangOpts().OpenMPIsDevice))
S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_unsupported)
<< "__float128";
Result = Context.Float128Ty;
break;
case DeclSpec::TST_bool: Result = Context.BoolTy; break; // _Bool or bool
case DeclSpec::TST_decimal32: // _Decimal32
case DeclSpec::TST_decimal64: // _Decimal64
case DeclSpec::TST_decimal128: // _Decimal128
S.Diag(DS.getTypeSpecTypeLoc(), diag::err_decimal_unsupported);
Result = Context.IntTy;
declarator.setInvalidType(true);
break;
case DeclSpec::TST_class:
case DeclSpec::TST_enum:
case DeclSpec::TST_union:
case DeclSpec::TST_struct:
case DeclSpec::TST_interface: {
TagDecl *D = dyn_cast_or_null<TagDecl>(DS.getRepAsDecl());
if (!D) {
// This can happen in C++ with ambiguous lookups.
Result = Context.IntTy;
declarator.setInvalidType(true);
break;
}
// If the type is deprecated or unavailable, diagnose it.
S.DiagnoseUseOfDecl(D, DS.getTypeSpecTypeNameLoc());
assert(DS.getTypeSpecWidth() == 0 && DS.getTypeSpecComplex() == 0 &&
DS.getTypeSpecSign() == 0 && "No qualifiers on tag names!");
// TypeQuals handled by caller.
Result = Context.getTypeDeclType(D);
// In both C and C++, make an ElaboratedType.
ElaboratedTypeKeyword Keyword
= ElaboratedType::getKeywordForTypeSpec(DS.getTypeSpecType());
Result = S.getElaboratedType(Keyword, DS.getTypeSpecScope(), Result,
DS.isTypeSpecOwned() ? D : nullptr);
break;
}
case DeclSpec::TST_typename: {
assert(DS.getTypeSpecWidth() == 0 && DS.getTypeSpecComplex() == 0 &&
DS.getTypeSpecSign() == 0 &&
"Can't handle qualifiers on typedef names yet!");
Result = S.GetTypeFromParser(DS.getRepAsType());
if (Result.isNull()) {
declarator.setInvalidType(true);
}
// TypeQuals handled by caller.
break;
}
case DeclSpec::TST_typeofType:
// FIXME: Preserve type source info.
Result = S.GetTypeFromParser(DS.getRepAsType());
assert(!Result.isNull() && "Didn't get a type for typeof?");
if (!Result->isDependentType())
if (const TagType *TT = Result->getAs<TagType>())
S.DiagnoseUseOfDecl(TT->getDecl(), DS.getTypeSpecTypeLoc());
// TypeQuals handled by caller.
Result = Context.getTypeOfType(Result);
break;
case DeclSpec::TST_typeofExpr: {
Expr *E = DS.getRepAsExpr();
assert(E && "Didn't get an expression for typeof?");
// TypeQuals handled by caller.
Result = S.BuildTypeofExprType(E, DS.getTypeSpecTypeLoc());
if (Result.isNull()) {
Result = Context.IntTy;
declarator.setInvalidType(true);
}
break;
}
case DeclSpec::TST_decltype: {
Expr *E = DS.getRepAsExpr();
assert(E && "Didn't get an expression for decltype?");
// TypeQuals handled by caller.
Result = S.BuildDecltypeType(E, DS.getTypeSpecTypeLoc());
if (Result.isNull()) {
Result = Context.IntTy;
declarator.setInvalidType(true);
}
break;
}
case DeclSpec::TST_underlyingType:
Result = S.GetTypeFromParser(DS.getRepAsType());
assert(!Result.isNull() && "Didn't get a type for __underlying_type?");
Result = S.BuildUnaryTransformType(Result,
UnaryTransformType::EnumUnderlyingType,
DS.getTypeSpecTypeLoc());
if (Result.isNull()) {
Result = Context.IntTy;
declarator.setInvalidType(true);
}
break;
case DeclSpec::TST_auto:
Result = Context.getAutoType(QualType(), AutoTypeKeyword::Auto, false);
break;
case DeclSpec::TST_auto_type:
Result = Context.getAutoType(QualType(), AutoTypeKeyword::GNUAutoType, false);
break;
case DeclSpec::TST_decltype_auto:
Result = Context.getAutoType(QualType(), AutoTypeKeyword::DecltypeAuto,
/*IsDependent*/ false);
break;
case DeclSpec::TST_unknown_anytype:
Result = Context.UnknownAnyTy;
break;
case DeclSpec::TST_atomic:
Result = S.GetTypeFromParser(DS.getRepAsType());
assert(!Result.isNull() && "Didn't get a type for _Atomic?");
Result = S.BuildAtomicType(Result, DS.getTypeSpecTypeLoc());
if (Result.isNull()) {
Result = Context.IntTy;
declarator.setInvalidType(true);
}
break;
#define GENERIC_IMAGE_TYPE(ImgType, Id) \
case DeclSpec::TST_##ImgType##_t: \
switch (getImageAccess(DS.getAttributes())) { \
case OpenCLAccessAttr::Keyword_write_only: \
Result = Context.Id##WOTy; \
break; \
case OpenCLAccessAttr::Keyword_read_write: \
Result = Context.Id##RWTy; \
break; \
case OpenCLAccessAttr::Keyword_read_only: \
Result = Context.Id##ROTy; \
break; \
} \
break;
#include "clang/Basic/OpenCLImageTypes.def"
case DeclSpec::TST_error:
Result = Context.IntTy;
declarator.setInvalidType(true);
break;
}
if (S.getLangOpts().OpenCL &&
S.checkOpenCLDisabledTypeDeclSpec(DS, Result))
declarator.setInvalidType(true);
bool IsFixedPointType = DS.getTypeSpecType() == DeclSpec::TST_accum ||
DS.getTypeSpecType() == DeclSpec::TST_fract;
// Only fixed-point types can be saturated.
if (DS.isTypeSpecSat() && !IsFixedPointType)
S.Diag(DS.getTypeSpecSatLoc(), diag::err_invalid_saturation_spec)
<< DS.getSpecifierName(DS.getTypeSpecType(),
Context.getPrintingPolicy());
// Handle complex types.
if (DS.getTypeSpecComplex() == DeclSpec::TSC_complex) {
if (S.getLangOpts().Freestanding)
S.Diag(DS.getTypeSpecComplexLoc(), diag::ext_freestanding_complex);
Result = Context.getComplexType(Result);
} else if (DS.isTypeAltiVecVector()) {
unsigned typeSize = static_cast<unsigned>(Context.getTypeSize(Result));
assert(typeSize > 0 && "type size for vector must be greater than 0 bits");
VectorType::VectorKind VecKind = VectorType::AltiVecVector;
if (DS.isTypeAltiVecPixel())
VecKind = VectorType::AltiVecPixel;
else if (DS.isTypeAltiVecBool())
VecKind = VectorType::AltiVecBool;
Result = Context.getVectorType(Result, 128/typeSize, VecKind);
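// The 128/typeSize computation above yields, e.g., a 4-element vector for
// AltiVec 'vector int' (128 / 32 == 4 lanes).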
}
// FIXME: Imaginary.
if (DS.getTypeSpecComplex() == DeclSpec::TSC_imaginary)
S.Diag(DS.getTypeSpecComplexLoc(), diag::err_imaginary_not_supported);
// Before we process any type attributes, synthesize a block literal
// function declarator if necessary.
if (declarator.getContext() == DeclaratorContext::BlockLiteralContext)
maybeSynthesizeBlockSignature(state, Result);
// Apply any type attributes from the decl spec. This may cause the
// list of type attributes to be temporarily saved while the type
// attributes are pushed around.
// Pipe attributes will be handled later (in GetFullTypeForDeclarator).
if (!DS.isTypeSpecPipe())
processTypeAttrs(state, Result, TAL_DeclSpec, DS.getAttributes());
// Apply const/volatile/restrict qualifiers to T.
if (unsigned TypeQuals = DS.getTypeQualifiers()) {
// Warn about CV qualifiers on function types.
// C99 6.7.3p8:
// If the specification of a function type includes any type qualifiers,
// the behavior is undefined.
// C++11 [dcl.fct]p7:
// The effect of a cv-qualifier-seq in a function declarator is not the
// same as adding cv-qualification on top of the function type. In the
// latter case, the cv-qualifiers are ignored.
if (TypeQuals && Result->isFunctionType()) {
diagnoseAndRemoveTypeQualifiers(
S, DS, TypeQuals, Result, DeclSpec::TQ_const | DeclSpec::TQ_volatile,
S.getLangOpts().CPlusPlus
? diag::warn_typecheck_function_qualifiers_ignored
: diag::warn_typecheck_function_qualifiers_unspecified);
// No diagnostic for 'restrict' or '_Atomic' applied to a
// function type; we'll diagnose those later, in BuildQualifiedType.
}
// C++11 [dcl.ref]p1:
// Cv-qualified references are ill-formed except when the
// cv-qualifiers are introduced through the use of a typedef-name
// or decltype-specifier, in which case the cv-qualifiers are ignored.
//
// There don't appear to be any other contexts in which a cv-qualified
// reference type could be formed, so the 'ill-formed' clause here appears
// to never happen.
if (TypeQuals && Result->isReferenceType()) {
diagnoseAndRemoveTypeQualifiers(
S, DS, TypeQuals, Result,
DeclSpec::TQ_const | DeclSpec::TQ_volatile | DeclSpec::TQ_atomic,
diag::warn_typecheck_reference_qualifiers);
}
// C90 6.5.3 constraints: "The same type qualifier shall not appear more
// than once in the same specifier-list or qualifier-list, either directly
// or via one or more typedefs."
if (!S.getLangOpts().C99 && !S.getLangOpts().CPlusPlus
&& TypeQuals & Result.getCVRQualifiers()) {
if (TypeQuals & DeclSpec::TQ_const && Result.isConstQualified()) {
S.Diag(DS.getConstSpecLoc(), diag::ext_duplicate_declspec)
<< "const";
}
if (TypeQuals & DeclSpec::TQ_volatile && Result.isVolatileQualified()) {
S.Diag(DS.getVolatileSpecLoc(), diag::ext_duplicate_declspec)
<< "volatile";
}
// C90 doesn't have restrict or _Atomic, so it doesn't force us to
// produce a warning in this case.
}
QualType Qualified = S.BuildQualifiedType(Result, DeclLoc, TypeQuals, &DS);
// If adding qualifiers fails, just use the unqualified type.
if (Qualified.isNull())
declarator.setInvalidType(true);
else
Result = Qualified;
}
assert(!Result.isNull() && "This function should not return a null type");
return Result;
}
static std::string getPrintableNameForEntity(DeclarationName Entity) {
if (Entity)
return Entity.getAsString();
return "type name";
}
QualType Sema::BuildQualifiedType(QualType T, SourceLocation Loc,
Qualifiers Qs, const DeclSpec *DS) {
if (T.isNull())
return QualType();
// Ignore any attempt to form a cv-qualified reference.
if (T->isReferenceType()) {
Qs.removeConst();
Qs.removeVolatile();
}
// Enforce C99 6.7.3p2: "Types other than pointer types derived from
// object or incomplete types shall not be restrict-qualified."
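// For example, 'int *restrict p;' is fine, but 'int restrict i;' is rejected
// below because 'restrict' only applies to pointer (and reference /
// member-pointer) types.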
if (Qs.hasRestrict()) {
unsigned DiagID = 0;
QualType ProblemTy;
if (T->isAnyPointerType() || T->isReferenceType() ||
T->isMemberPointerType()) {
QualType EltTy;
if (T->isObjCObjectPointerType())
EltTy = T;
else if (const MemberPointerType *PTy = T->getAs<MemberPointerType>())
EltTy = PTy->getPointeeType();
else
EltTy = T->getPointeeType();
// If we have a pointer or reference, the pointee must have an object or
// incomplete type.
if (!EltTy->isIncompleteOrObjectType()) {
DiagID = diag::err_typecheck_invalid_restrict_invalid_pointee;
ProblemTy = EltTy;
}
} else if (!T->isDependentType()) {
DiagID = diag::err_typecheck_invalid_restrict_not_pointer;
ProblemTy = T;
}
if (DiagID) {
Diag(DS ? DS->getRestrictSpecLoc() : Loc, DiagID) << ProblemTy;
Qs.removeRestrict();
}
}
return Context.getQualifiedType(T, Qs);
}
QualType Sema::BuildQualifiedType(QualType T, SourceLocation Loc,
unsigned CVRAU, const DeclSpec *DS) {
if (T.isNull())
return QualType();
// Ignore any attempt to form a cv-qualified reference.
if (T->isReferenceType())
CVRAU &=
~(DeclSpec::TQ_const | DeclSpec::TQ_volatile | DeclSpec::TQ_atomic);
// Convert from DeclSpec::TQ to Qualifiers::TQ by just dropping TQ_atomic and
// TQ_unaligned;
unsigned CVR = CVRAU & ~(DeclSpec::TQ_atomic | DeclSpec::TQ_unaligned);
// C11 6.7.3/5:
// If the same qualifier appears more than once in the same
// specifier-qualifier-list, either directly or via one or more typedefs,
// the behavior is the same as if it appeared only once.
//
// It's not specified what happens when the _Atomic qualifier is applied to
// a type specified with the _Atomic specifier, but we assume that this
// should be treated as if the _Atomic qualifier appeared multiple times.
if (CVRAU & DeclSpec::TQ_atomic && !T->isAtomicType()) {
// C11 6.7.3/5:
// If other qualifiers appear along with the _Atomic qualifier in a
// specifier-qualifier-list, the resulting type is the so-qualified
// atomic type.
//
// Don't need to worry about array types here, since _Atomic can't be
// applied to such types.
SplitQualType Split = T.getSplitUnqualifiedType();
T = BuildAtomicType(QualType(Split.Ty, 0),
DS ? DS->getAtomicSpecLoc() : Loc);
if (T.isNull())
return T;
Split.Quals.addCVRQualifiers(CVR);
return BuildQualifiedType(T, Loc, Split.Quals);
}
Qualifiers Q = Qualifiers::fromCVRMask(CVR);
Q.setUnaligned(CVRAU & DeclSpec::TQ_unaligned);
return BuildQualifiedType(T, Loc, Q, DS);
}
/// Build a paren type including \p T.
QualType Sema::BuildParenType(QualType T) {
return Context.getParenType(T);
}
/// Given that we're building a pointer or reference to the given type, infer
/// the appropriate ARC ownership qualifier for the pointee, if one is needed.
static QualType inferARCLifetimeForPointee(Sema &S, QualType type,
SourceLocation loc,
bool isReference) {
// Bail out if retention is unrequired or already specified.
if (!type->isObjCLifetimeType() ||
type.getObjCLifetime() != Qualifiers::OCL_None)
return type;
Qualifiers::ObjCLifetime implicitLifetime = Qualifiers::OCL_None;
// If the object type is const-qualified, we can safely use
// __unsafe_unretained. This is safe (because there are no read
// barriers), and it'll be safe to coerce anything but __weak* to
// the resulting type.
if (type.isConstQualified()) {
implicitLifetime = Qualifiers::OCL_ExplicitNone;
// Otherwise, check whether the static type does not require
// retaining. This currently only triggers for Class (possibly
// protocol-qualified, and arrays thereof).
} else if (type->isObjCARCImplicitlyUnretainedType()) {
implicitLifetime = Qualifiers::OCL_ExplicitNone;
// If we are in an unevaluated context, like sizeof, skip adding a
// qualification.
} else if (S.isUnevaluatedContext()) {
return type;
// If that failed, give an error and recover using __strong. __strong
// is the option most likely to prevent spurious second-order diagnostics,
// like when binding a reference to a field.
} else {
// These types can show up in private ivars in system headers, so
// we need this to not be an error in those cases. Instead we
// want to delay.
if (S.DelayedDiagnostics.shouldDelayDiagnostics()) {
S.DelayedDiagnostics.add(
sema::DelayedDiagnostic::makeForbiddenType(loc,
diag::err_arc_indirect_no_ownership, type, isReference));
} else {
S.Diag(loc, diag::err_arc_indirect_no_ownership) << type << isReference;
}
implicitLifetime = Qualifiers::OCL_Strong;
}
assert(implicitLifetime && "didn't infer any lifetime!");
Qualifiers qs;
qs.addObjCLifetime(implicitLifetime);
return S.Context.getQualifiedType(type, qs);
}
static std::string getFunctionQualifiersAsString(const FunctionProtoType *FnTy){
std::string Quals = FnTy->getMethodQuals().getAsString();
switch (FnTy->getRefQualifier()) {
case RQ_None:
break;
case RQ_LValue:
if (!Quals.empty())
Quals += ' ';
Quals += '&';
break;
case RQ_RValue:
if (!Quals.empty())
Quals += ' ';
Quals += "&&";
break;
}
return Quals;
}
namespace {
/// Kinds of declarator that cannot contain a qualified function type.
///
/// C++98 [dcl.fct]p4 / C++11 [dcl.fct]p6:
/// a function type with a cv-qualifier or a ref-qualifier can only appear
/// at the topmost level of a type.
///
/// Parens and member pointers are permitted. We don't diagnose array and
/// function declarators, because they don't allow function types at all.
///
/// The values of this enum are used in diagnostics.
enum QualifiedFunctionKind { QFK_BlockPointer, QFK_Pointer, QFK_Reference };
} // end anonymous namespace
/// Check whether the type T is a qualified function type, and if it is,
/// diagnose that it cannot be contained within the given kind of declarator.
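/// For example, given 'using FC = void() const;', declaring 'FC *p;' is
/// ill-formed because a cv-qualified function type cannot be the pointee of
/// a pointer.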
static bool checkQualifiedFunction(Sema &S, QualType T, SourceLocation Loc,
QualifiedFunctionKind QFK) {
// Does T refer to a function type with a cv-qualifier or a ref-qualifier?
const FunctionProtoType *FPT = T->getAs<FunctionProtoType>();
if (!FPT || (FPT->getMethodQuals().empty() && FPT->getRefQualifier() == RQ_None))
return false;
S.Diag(Loc, diag::err_compound_qualified_function_type)
<< QFK << isa<FunctionType>(T.IgnoreParens()) << T
<< getFunctionQualifiersAsString(FPT);
return true;
}
/// Build a pointer type.
///
/// \param T The type to which we'll be building a pointer.
///
/// \param Loc The location of the entity whose type involves this
/// pointer type or, if there is no such entity, the location of the
/// type that will have pointer type.
///
/// \param Entity The name of the entity that involves the pointer
/// type, if known.
///
/// \returns A suitable pointer type, if there are no
/// errors. Otherwise, returns a NULL type.
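/// For example, 'int &*p;' (a pointer to a reference) is rejected here with
/// err_illegal_decl_pointer_to_reference.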
QualType Sema::BuildPointerType(QualType T,
SourceLocation Loc, DeclarationName Entity) {
if (T->isReferenceType()) {
// C++ 8.3.2p4: There shall be no ... pointers to references ...
Diag(Loc, diag::err_illegal_decl_pointer_to_reference)
<< getPrintableNameForEntity(Entity) << T;
return QualType();
}
if (T->isFunctionType() && getLangOpts().OpenCL) {
Diag(Loc, diag::err_opencl_function_pointer);
return QualType();
}
if (checkQualifiedFunction(*this, T, Loc, QFK_Pointer))
return QualType();
assert(!T->isObjCObjectType() && "Should build ObjCObjectPointerType");
// In ARC, it is forbidden to build pointers to unqualified pointers.
if (getLangOpts().ObjCAutoRefCount)
T = inferARCLifetimeForPointee(*this, T, Loc, /*reference*/ false);
// Build the pointer type.
return Context.getPointerType(T);
}
/// Build a reference type.
///
/// \param T The type to which we'll be building a reference.
///
/// \param Loc The location of the entity whose type involves this
/// reference type or, if there is no such entity, the location of the
/// type that will have reference type.
///
/// \param Entity The name of the entity that involves the reference
/// type, if known.
///
/// \returns A suitable reference type, if there are no
/// errors. Otherwise, returns a NULL type.
QualType Sema::BuildReferenceType(QualType T, bool SpelledAsLValue,
SourceLocation Loc,
DeclarationName Entity) {
assert(Context.getCanonicalType(T) != Context.OverloadTy &&
"Unresolved overloaded function type");
// C++0x [dcl.ref]p6:
// If a typedef (7.1.3), a type template-parameter (14.3.1), or a
// decltype-specifier (7.1.6.2) denotes a type TR that is a reference to a
// type T, an attempt to create the type "lvalue reference to cv TR" creates
// the type "lvalue reference to T", while an attempt to create the type
// "rvalue reference to cv TR" creates the type TR.
bool LValueRef = SpelledAsLValue || T->getAs<LValueReferenceType>();
// C++ [dcl.ref]p4: There shall be no references to references.
//
// According to C++ DR 106, references to references are only
// diagnosed when they are written directly (e.g., "int & &"),
// but not when they happen via a typedef:
//
// typedef int& intref;
// typedef intref& intref2;
//
// Parser::ParseDeclaratorInternal diagnoses the case where
// references are written directly; here, we handle the
// collapsing of references-to-references as described in C++0x.
// DR 106 and 540 introduce reference-collapsing into C++98/03.
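// For example, with 'typedef int& intref;', both 'intref&' and 'intref&&'
// collapse to 'int&': the result is an lvalue reference whenever either the
// spelled reference or the typedef'd type is an lvalue reference.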
// C++ [dcl.ref]p1:
// A declarator that specifies the type "reference to cv void"
// is ill-formed.
if (T->isVoidType()) {
Diag(Loc, diag::err_reference_to_void);
return QualType();
}
if (checkQualifiedFunction(*this, T, Loc, QFK_Reference))
return QualType();
// In ARC, it is forbidden to build references to unqualified pointers.
if (getLangOpts().ObjCAutoRefCount)
T = inferARCLifetimeForPointee(*this, T, Loc, /*reference*/ true);
// Handle restrict on references.
if (LValueRef)
return Context.getLValueReferenceType(T, SpelledAsLValue);
return Context.getRValueReferenceType(T);
}
/// Build a Read-only Pipe type.
///
/// \param T The type to which we'll be building a Pipe.
///
/// \param Loc Currently unused.
///
/// \returns A suitable pipe type, if there are no errors. Otherwise, returns a
/// NULL type.
QualType Sema::BuildReadPipeType(QualType T, SourceLocation Loc) {
return Context.getReadPipeType(T);
}
/// Build a Write-only Pipe type.
///
/// \param T The type to which we'll be building a Pipe.
///
/// \param Loc Currently unused.
///
/// \returns A suitable pipe type, if there are no errors. Otherwise, returns a
/// NULL type.
QualType Sema::BuildWritePipeType(QualType T, SourceLocation Loc) {
return Context.getWritePipeType(T);
}
/// Check whether the specified array size makes the array type a VLA. If so,
/// return true; if not, return the size of the array in SizeVal.
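/// For example, 'int n = 8; int a[n];' makes 'a' a VLA in C99, while in GNU
/// mode 'const int k = 8; int b[k];' folds to a constant size and is accepted
/// with only the ext_vla_folded_to_constant extension warning.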
static bool isArraySizeVLA(Sema &S, Expr *ArraySize, llvm::APSInt &SizeVal) {
// If the size is an ICE, it certainly isn't a VLA. If we're in a GNU mode
// (like gnu99, but not c99), accept any evaluatable value as an extension.
class VLADiagnoser : public Sema::VerifyICEDiagnoser {
public:
VLADiagnoser() : Sema::VerifyICEDiagnoser(true) {}
void diagnoseNotICE(Sema &S, SourceLocation Loc, SourceRange SR) override {
}
void diagnoseFold(Sema &S, SourceLocation Loc, SourceRange SR) override {
S.Diag(Loc, diag::ext_vla_folded_to_constant) << SR;
}
} Diagnoser;
return S.VerifyIntegerConstantExpression(ArraySize, &SizeVal, Diagnoser,
S.LangOpts.GNUMode ||
S.LangOpts.OpenCL).isInvalid();
}
/// Build an array type.
///
/// \param T The type of each element in the array.
///
/// \param ASM C99 array size modifier (e.g., '*', 'static').
///
/// \param ArraySize Expression describing the size of the array.
///
/// \param Brackets The range from the opening '[' to the closing ']'.
///
/// \param Entity The name of the entity that involves the array
/// type, if known.
///
/// \returns A suitable array type, if there are no errors. Otherwise,
/// returns a NULL type.
QualType Sema::BuildArrayType(QualType T, ArrayType::ArraySizeModifier ASM,
Expr *ArraySize, unsigned Quals,
SourceRange Brackets, DeclarationName Entity) {
SourceLocation Loc = Brackets.getBegin();
if (getLangOpts().CPlusPlus) {
// C++ [dcl.array]p1:
// T is called the array element type; this type shall not be a reference
// type, the (possibly cv-qualified) type void, a function type or an
// abstract class type.
//
// C++ [dcl.array]p3:
// When several "array of" specifications are adjacent, [...] only the
// first of the constant expressions that specify the bounds of the arrays
// may be omitted.
//
// Note: function types are handled in the common path with C.
if (T->isReferenceType()) {
Diag(Loc, diag::err_illegal_decl_array_of_references)
<< getPrintableNameForEntity(Entity) << T;
return QualType();
}
if (T->isVoidType() || T->isIncompleteArrayType()) {
Diag(Loc, diag::err_illegal_decl_array_incomplete_type) << T;
return QualType();
}
if (RequireNonAbstractType(Brackets.getBegin(), T,
diag::err_array_of_abstract_type))
return QualType();
// Mentioning a member pointer type for an array type causes us to lock in
// an inheritance model, even if it's inside an unused typedef.
if (Context.getTargetInfo().getCXXABI().isMicrosoft())
if (const MemberPointerType *MPTy = T->getAs<MemberPointerType>())
if (!MPTy->getClass()->isDependentType())
(void)isCompleteType(Loc, T);
} else {
// C99 6.7.5.2p1: If the element type is an incomplete or function type,
// reject it (e.g. void ary[7], struct foo ary[7], void ary[7]())
if (RequireCompleteType(Loc, T,
diag::err_illegal_decl_array_incomplete_type))
return QualType();
}
if (T->isFunctionType()) {
Diag(Loc, diag::err_illegal_decl_array_of_functions)
<< getPrintableNameForEntity(Entity) << T;
return QualType();
}
if (const RecordType *EltTy = T->getAs<RecordType>()) {
// If the element type is a struct or union that contains a variadic
// array, accept it as a GNU extension: C99 6.7.2.1p2.
if (EltTy->getDecl()->hasFlexibleArrayMember())
Diag(Loc, diag::ext_flexible_array_in_array) << T;
} else if (T->isObjCObjectType()) {
Diag(Loc, diag::err_objc_array_of_interfaces) << T;
return QualType();
}
// Do placeholder conversions on the array size expression.
if (ArraySize && ArraySize->hasPlaceholderType()) {
ExprResult Result = CheckPlaceholderExpr(ArraySize);
if (Result.isInvalid()) return QualType();
ArraySize = Result.get();
}
// Do lvalue-to-rvalue conversions on the array size expression.
if (ArraySize && !ArraySize->isRValue()) {
ExprResult Result = DefaultLvalueConversion(ArraySize);
if (Result.isInvalid())
return QualType();
ArraySize = Result.get();
}
// C99 6.7.5.2p1: The size expression shall have integer type.
// C++11 allows contextual conversions to such types.
if (!getLangOpts().CPlusPlus11 &&
ArraySize && !ArraySize->isTypeDependent() &&
!ArraySize->getType()->isIntegralOrUnscopedEnumerationType()) {
Diag(ArraySize->getBeginLoc(), diag::err_array_size_non_int)
<< ArraySize->getType() << ArraySize->getSourceRange();
return QualType();
}
llvm::APSInt ConstVal(Context.getTypeSize(Context.getSizeType()));
if (!ArraySize) {
if (ASM == ArrayType::Star)
T = Context.getVariableArrayType(T, nullptr, ASM, Quals, Brackets);
else
T = Context.getIncompleteArrayType(T, ASM, Quals);
} else if (ArraySize->isTypeDependent() || ArraySize->isValueDependent()) {
T = Context.getDependentSizedArrayType(T, ArraySize, ASM, Quals, Brackets);
} else if ((!T->isDependentType() && !T->isIncompleteType() &&
!T->isConstantSizeType()) ||
isArraySizeVLA(*this, ArraySize, ConstVal)) {
// Even in C++11, don't allow contextual conversions in the array bound
// of a VLA.
if (getLangOpts().CPlusPlus11 &&
!ArraySize->getType()->isIntegralOrUnscopedEnumerationType()) {
Diag(ArraySize->getBeginLoc(), diag::err_array_size_non_int)
<< ArraySize->getType() << ArraySize->getSourceRange();
return QualType();
}
// C99: an array with an element type that has a non-constant size is a VLA.
// C99: an array with a non-ICE size is a VLA. We accept any expression
// that we can fold to a non-zero positive value as an extension.
T = Context.getVariableArrayType(T, ArraySize, ASM, Quals, Brackets);
} else {
// C99 6.7.5.2p1: If the expression is a constant expression, it shall
// have a value greater than zero.
if (ConstVal.isSigned() && ConstVal.isNegative()) {
if (Entity)
Diag(ArraySize->getBeginLoc(), diag::err_decl_negative_array_size)
<< getPrintableNameForEntity(Entity) << ArraySize->getSourceRange();
else
Diag(ArraySize->getBeginLoc(), diag::err_typecheck_negative_array_size)
<< ArraySize->getSourceRange();
return QualType();
}
if (ConstVal == 0) {
// GCC accepts zero-sized static arrays. We allow them when
// we're not in a SFINAE context.
Diag(ArraySize->getBeginLoc(), isSFINAEContext()
? diag::err_typecheck_zero_array_size
: diag::ext_typecheck_zero_array_size)
<< ArraySize->getSourceRange();
if (ASM == ArrayType::Static) {
Diag(ArraySize->getBeginLoc(),
diag::warn_typecheck_zero_static_array_size)
<< ArraySize->getSourceRange();
ASM = ArrayType::Normal;
}
} else if (!T->isDependentType() && !T->isVariablyModifiedType() &&
!T->isIncompleteType() && !T->isUndeducedType()) {
// Is the array too large?
unsigned ActiveSizeBits
= ConstantArrayType::getNumAddressingBits(Context, T, ConstVal);
if (ActiveSizeBits > ConstantArrayType::getMaxSizeBits(Context)) {
Diag(ArraySize->getBeginLoc(), diag::err_array_too_large)
<< ConstVal.toString(10) << ArraySize->getSourceRange();
return QualType();
}
}
T = Context.getConstantArrayType(T, ConstVal, ASM, Quals);
}
// OpenCL v1.2 s6.9.d: variable length arrays are not supported.
if (getLangOpts().OpenCL && T->isVariableArrayType()) {
Diag(Loc, diag::err_opencl_vla);
return QualType();
}
if (T->isVariableArrayType() && !Context.getTargetInfo().isVLASupported()) {
// CUDA device code and some other targets don't support VLAs.
targetDiag(Loc, (getLangOpts().CUDA && getLangOpts().CUDAIsDevice)
? diag::err_cuda_vla
: diag::err_vla_unsupported)
<< ((getLangOpts().CUDA && getLangOpts().CUDAIsDevice)
? CurrentCUDATarget()
: CFT_InvalidTarget);
}
// If this is not C99, extwarn about VLAs and C99 array size modifiers.
if (!getLangOpts().C99) {
if (T->isVariableArrayType()) {
// Prohibit the use of VLAs during template argument deduction.
if (isSFINAEContext()) {
Diag(Loc, diag::err_vla_in_sfinae);
return QualType();
}
// Just extwarn about VLAs.
else
Diag(Loc, diag::ext_vla);
} else if (ASM != ArrayType::Normal || Quals != 0)
Diag(Loc,
getLangOpts().CPlusPlus? diag::err_c99_array_usage_cxx
: diag::ext_c99_array_usage) << ASM;
}
if (T->isVariableArrayType()) {
// Warn about VLAs for -Wvla.
Diag(Loc, diag::warn_vla_used);
}
// OpenCL v2.0 s6.12.5 - Arrays of blocks are not supported.
// OpenCL v2.0 s6.16.13.1 - Arrays of pipe type are not supported.
// OpenCL v2.0 s6.9.b - Arrays of image/sampler type are not supported.
if (getLangOpts().OpenCL) {
const QualType ArrType = Context.getBaseElementType(T);
if (ArrType->isBlockPointerType() || ArrType->isPipeType() ||
ArrType->isSamplerT() || ArrType->isImageType()) {
Diag(Loc, diag::err_opencl_invalid_type_array) << ArrType;
return QualType();
}
}
return T;
}
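// Illustrative sketch (hypothetical user code, not part of this source file):
// declarations exercising the checks in BuildArrayType() above.
//
//   int a[-1];        // negative bound: err_decl_negative_array_size
//   int b[0];         // ext_typecheck_zero_array_size (an error in SFINAE contexts)
//   typedef void F();
//   F c[4];           // err_illegal_decl_array_of_functions
//   int n = 4;
//   int d[n];         // VLA: ext_vla in C++ / pre-C99, err_opencl_vla under OpenCL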
QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr,
SourceLocation AttrLoc) {
// The base type must be integer (not Boolean or enumeration) or float, and
// can't already be a vector.
if (!CurType->isDependentType() &&
(!CurType->isBuiltinType() || CurType->isBooleanType() ||
(!CurType->isIntegerType() && !CurType->isRealFloatingType()))) {
Diag(AttrLoc, diag::err_attribute_invalid_vector_type) << CurType;
return QualType();
}
if (SizeExpr->isTypeDependent() || SizeExpr->isValueDependent())
return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc,
VectorType::GenericVector);
llvm::APSInt VecSize(32);
if (!SizeExpr->isIntegerConstantExpr(VecSize, Context)) {
Diag(AttrLoc, diag::err_attribute_argument_type)
<< "vector_size" << AANT_ArgumentIntegerConstant
<< SizeExpr->getSourceRange();
return QualType();
}
if (CurType->isDependentType())
return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc,
VectorType::GenericVector);
unsigned VectorSize = static_cast<unsigned>(VecSize.getZExtValue() * 8);
unsigned TypeSize = static_cast<unsigned>(Context.getTypeSize(CurType));
if (VectorSize == 0) {
Diag(AttrLoc, diag::err_attribute_zero_size) << SizeExpr->getSourceRange();
return QualType();
}
// vecSize was specified in bytes and converted to bits above; the total bit
// width must be a multiple of the element type's size.
if (VectorSize % TypeSize) {
Diag(AttrLoc, diag::err_attribute_invalid_size)
<< SizeExpr->getSourceRange();
return QualType();
}
if (VectorType::isVectorSizeTooLarge(VectorSize / TypeSize)) {
Diag(AttrLoc, diag::err_attribute_size_too_large)
<< SizeExpr->getSourceRange();
return QualType();
}
return Context.getVectorType(CurType, VectorSize / TypeSize,
VectorType::GenericVector);
}
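// Illustrative sketch (hypothetical user code, not part of this source file):
// GCC-style vector_size takes a byte count, so the lane count is
// (bytes * 8) / element-bit-width.
//
//   typedef int   v4si __attribute__((vector_size(16))); // 128 bits / 32 = 4 lanes
//   typedef float bad7 __attribute__((vector_size(7)));  // 56 bits is not a
//                                                        // multiple of 32: rejected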
/// Build an ext-vector type.
///
/// Run the required checks for the extended vector type.
QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize,
SourceLocation AttrLoc) {
// Unlike gcc's vector_size attribute, we do not allow vectors to be defined
// in conjunction with complex types (pointers, arrays, functions, etc.).
//
// Additionally, OpenCL prohibits vectors of booleans (they're considered a
// reserved data type under OpenCL v2.0 s6.1.4); we don't support selects
// on bitvectors, and we have no well-defined ABI for bitvectors, so vectors
// of bool aren't allowed.
if ((!T->isDependentType() && !T->isIntegerType() &&
!T->isRealFloatingType()) ||
T->isBooleanType()) {
Diag(AttrLoc, diag::err_attribute_invalid_vector_type) << T;
return QualType();
}
if (!ArraySize->isTypeDependent() && !ArraySize->isValueDependent()) {
llvm::APSInt vecSize(32);
if (!ArraySize->isIntegerConstantExpr(vecSize, Context)) {
Diag(AttrLoc, diag::err_attribute_argument_type)
<< "ext_vector_type" << AANT_ArgumentIntegerConstant
<< ArraySize->getSourceRange();
return QualType();
}
// Unlike gcc's vector_size attribute, the size is specified as the
// number of elements, not the number of bytes.
unsigned vectorSize = static_cast<unsigned>(vecSize.getZExtValue());
if (vectorSize == 0) {
Diag(AttrLoc, diag::err_attribute_zero_size)
<< ArraySize->getSourceRange();
return QualType();
}
if (VectorType::isVectorSizeTooLarge(vectorSize)) {
Diag(AttrLoc, diag::err_attribute_size_too_large)
<< ArraySize->getSourceRange();
return QualType();
}
return Context.getExtVectorType(T, vectorSize);
}
return Context.getDependentSizedExtVectorType(T, ArraySize, AttrLoc);
}
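// Illustrative sketch (hypothetical user code, not part of this source file):
// unlike vector_size, ext_vector_type takes an element count rather than bytes.
//
//   typedef float float4 __attribute__((ext_vector_type(4))); // 4 x float
//   typedef float badf   __attribute__((ext_vector_type(0))); // err_attribute_zero_size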
bool Sema::CheckFunctionReturnType(QualType T, SourceLocation Loc) {
if (T->isArrayType() || T->isFunctionType()) {
Diag(Loc, diag::err_func_returning_array_function)
<< T->isFunctionType() << T;
return true;
}
// Functions cannot return half FP.
if (T->isHalfType() && !getLangOpts().HalfArgsAndReturns) {
Diag(Loc, diag::err_parameters_retval_cannot_have_fp16_type) << 1 <<
FixItHint::CreateInsertion(Loc, "*");
return true;
}
// Methods cannot return interface types. All ObjC objects are
// passed by reference.
if (T->isObjCObjectType()) {
Diag(Loc, diag::err_object_cannot_be_passed_returned_by_value)
<< 0 << T << FixItHint::CreateInsertion(Loc, "*");
return true;
}
if (T.hasNonTrivialToPrimitiveDestructCUnion() ||
T.hasNonTrivialToPrimitiveCopyCUnion())
checkNonTrivialCUnion(T, Loc, NTCUC_FunctionReturn,
NTCUK_Destruct|NTCUK_Copy);
return false;
}
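// Illustrative sketch (hypothetical user code, not part of this source file):
// return types rejected by CheckFunctionReturnType() above.
//
//   int f()[4];   // function returning an array type
//   int g()();    // function returning a function type
//   __fp16 h();   // half FP return, rejected unless the target/options allow it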
/// Check the extended parameter information. Most of the necessary
/// checking should occur when applying the parameter attribute; the
/// only other checks required are positional restrictions.
static void checkExtParameterInfos(Sema &S, ArrayRef<QualType> paramTypes,
const FunctionProtoType::ExtProtoInfo &EPI,
llvm::function_ref<SourceLocation(unsigned)> getParamLoc) {
assert(EPI.ExtParameterInfos && "shouldn't get here without param infos");
bool hasCheckedSwiftCall = false;
auto checkForSwiftCC = [&](unsigned paramIndex) {
// Only do this once.
if (hasCheckedSwiftCall) return;
hasCheckedSwiftCall = true;
if (EPI.ExtInfo.getCC() == CC_Swift) return;
S.Diag(getParamLoc(paramIndex), diag::err_swift_param_attr_not_swiftcall)
<< getParameterABISpelling(EPI.ExtParameterInfos[paramIndex].getABI());
};
for (size_t paramIndex = 0, numParams = paramTypes.size();
paramIndex != numParams; ++paramIndex) {
switch (EPI.ExtParameterInfos[paramIndex].getABI()) {
// Nothing interesting to check for ordinary-ABI parameters.
case ParameterABI::Ordinary:
continue;
// swift_indirect_result parameters must be a prefix of the function
// arguments.
case ParameterABI::SwiftIndirectResult:
checkForSwiftCC(paramIndex);
if (paramIndex != 0 &&
EPI.ExtParameterInfos[paramIndex - 1].getABI()
!= ParameterABI::SwiftIndirectResult) {
S.Diag(getParamLoc(paramIndex),
diag::err_swift_indirect_result_not_first);
}
continue;
case ParameterABI::SwiftContext:
checkForSwiftCC(paramIndex);
continue;
// swift_error parameters must be preceded by a swift_context parameter.
case ParameterABI::SwiftErrorResult:
checkForSwiftCC(paramIndex);
if (paramIndex == 0 ||
EPI.ExtParameterInfos[paramIndex - 1].getABI() !=
ParameterABI::SwiftContext) {
S.Diag(getParamLoc(paramIndex),
diag::err_swift_error_result_not_after_swift_context);
}
continue;
}
llvm_unreachable("bad ABI kind");
}
}
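// Illustrative sketch (hypothetical user code, not part of this source file):
// the positional rules enforced above, assuming the swiftcall convention.
//
//   __attribute__((swiftcall)) void f(
//       void *ctx __attribute__((swift_context)),         // context parameter
//       void **err __attribute__((swift_error_result)));  // must immediately
//                                                         // follow swift_context
//
// Any swift_indirect_result parameters must form a prefix of the parameter
// list, and all of these parameter ABIs require the Swift calling convention.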
QualType Sema::BuildFunctionType(QualType T,
MutableArrayRef<QualType> ParamTypes,
SourceLocation Loc, DeclarationName Entity,
const FunctionProtoType::ExtProtoInfo &EPI) {
bool Invalid = false;
Invalid |= CheckFunctionReturnType(T, Loc);
for (unsigned Idx = 0, Cnt = ParamTypes.size(); Idx < Cnt; ++Idx) {
// FIXME: Loc is too imprecise here, should use proper locations for args.
QualType ParamType = Context.getAdjustedParameterType(ParamTypes[Idx]);
if (ParamType->isVoidType()) {
Diag(Loc, diag::err_param_with_void_type);
Invalid = true;
} else if (ParamType->isHalfType() && !getLangOpts().HalfArgsAndReturns) {
// Disallow half FP arguments.
Diag(Loc, diag::err_parameters_retval_cannot_have_fp16_type) << 0 <<
FixItHint::CreateInsertion(Loc, "*");
Invalid = true;
}
ParamTypes[Idx] = ParamType;
}
if (EPI.ExtParameterInfos) {
checkExtParameterInfos(*this, ParamTypes, EPI,
[=](unsigned i) { return Loc; });
}
if (EPI.ExtInfo.getProducesResult()) {
// This is just a warning, so we can't fail to build if we see it.
checkNSReturnsRetainedReturnType(Loc, T);
}
if (Invalid)
return QualType();
return Context.getFunctionType(T, ParamTypes, EPI);
}
/// Build a member pointer type \c T Class::*.
///
/// \param T the type to which the member pointer refers.
/// \param Class the class type into which the member pointer points.
/// \param Loc the location where this type begins
/// \param Entity the name of the entity that will have this member pointer type
///
/// \returns a member pointer type, if successful, or a NULL type if there was
/// an error.
QualType Sema::BuildMemberPointerType(QualType T, QualType Class,
SourceLocation Loc,
DeclarationName Entity) {
// Verify that we're not building a pointer to pointer to function with
// exception specification.
if (CheckDistantExceptionSpec(T)) {
Diag(Loc, diag::err_distant_exception_spec);
return QualType();
}
// C++ 8.3.3p3: A pointer to member shall not point to ... a member
// with reference type, or "cv void."
if (T->isReferenceType()) {
Diag(Loc, diag::err_illegal_decl_mempointer_to_reference)
<< getPrintableNameForEntity(Entity) << T;
return QualType();
}
if (T->isVoidType()) {
Diag(Loc, diag::err_illegal_decl_mempointer_to_void)
<< getPrintableNameForEntity(Entity);
return QualType();
}
if (!Class->isDependentType() && !Class->isRecordType()) {
Diag(Loc, diag::err_mempointer_in_nonclass_type) << Class;
return QualType();
}
// Adjust the default free function calling convention to the default method
// calling convention.
bool IsCtorOrDtor =
(Entity.getNameKind() == DeclarationName::CXXConstructorName) ||
(Entity.getNameKind() == DeclarationName::CXXDestructorName);
if (T->isFunctionType())
adjustMemberFunctionCC(T, /*IsStatic=*/false, IsCtorOrDtor, Loc);
return Context.getMemberPointerType(T, Class.getTypePtr());
}
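// Illustrative sketch (hypothetical user code, not part of this source file):
//
//   struct C { int m; };
//   int  C::*pm;   // OK: pointer to data member
//   int &C::*pr;   // rejected: member pointer to a reference
//   void C::*pv;   // rejected: member pointer to cv void
//   template <typename T> struct S { int T::*p; };
//   S<int> s;      // rejected on instantiation: 'int' is not a class type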
/// Build a block pointer type.
///
/// \param T The type to which we'll be building a block pointer.
///
/// \param Loc The source location, used for diagnostics.
///
/// \param Entity The name of the entity that involves the block pointer
/// type, if known.
///
/// \returns A suitable block pointer type, if there are no
/// errors. Otherwise, returns a NULL type.
QualType Sema::BuildBlockPointerType(QualType T,
SourceLocation Loc,
DeclarationName Entity) {
if (!T->isFunctionType()) {
Diag(Loc, diag::err_nonfunction_block_type);
return QualType();
}
if (checkQualifiedFunction(*this, T, Loc, QFK_BlockPointer))
return QualType();
return Context.getBlockPointerType(T);
}
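// Illustrative sketch (hypothetical user code, not part of this source file),
// assuming blocks are enabled (-fblocks):
//
//   void (^done)(int status);  // OK: block pointer to a function type
//   int ^bad;                  // rejected: err_nonfunction_block_type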
QualType Sema::GetTypeFromParser(ParsedType Ty, TypeSourceInfo **TInfo) {
QualType QT = Ty.get();
if (QT.isNull()) {
if (TInfo) *TInfo = nullptr;
return QualType();
}
TypeSourceInfo *DI = nullptr;
if (const LocInfoType *LIT = dyn_cast<LocInfoType>(QT)) {
QT = LIT->getType();
DI = LIT->getTypeSourceInfo();
}
if (TInfo) *TInfo = DI;
return QT;
}
static void transferARCOwnershipToDeclaratorChunk(TypeProcessingState &state,
Qualifiers::ObjCLifetime ownership,
unsigned chunkIndex);
/// Given that this is the declaration of a parameter under ARC,
/// attempt to infer attributes and such for pointer-to-whatever
/// types.
static void inferARCWriteback(TypeProcessingState &state,
QualType &declSpecType) {
Sema &S = state.getSema();
Declarator &declarator = state.getDeclarator();
// TODO: should we care about decl qualifiers?
// Check whether the declarator has the expected form. We walk
// from the inside out in order to make the block logic work.
unsigned outermostPointerIndex = 0;
bool isBlockPointer = false;
unsigned numPointers = 0;
for (unsigned i = 0, e = declarator.getNumTypeObjects(); i != e; ++i) {
unsigned chunkIndex = i;
DeclaratorChunk &chunk = declarator.getTypeObject(chunkIndex);
switch (chunk.Kind) {
case DeclaratorChunk::Paren:
// Ignore parens.
break;
case DeclaratorChunk::Reference:
case DeclaratorChunk::Pointer:
// Count the number of pointers. Treat references
// interchangeably as pointers; if they're mis-ordered, normal
// type building will discover that.
outermostPointerIndex = chunkIndex;
numPointers++;
break;
case DeclaratorChunk::BlockPointer:
// If we have a pointer to block pointer, that's an acceptable
// indirect reference; anything else is not an application of
// the rules.
if (numPointers != 1) return;
numPointers++;
outermostPointerIndex = chunkIndex;
isBlockPointer = true;
// We don't care about pointer structure in return values here.
goto done;
case DeclaratorChunk::Array: // suppress if written (id[])?
case DeclaratorChunk::Function:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pipe:
return;
}
}
done:
// If we have *one* pointer, then we want to throw the qualifier on
// the declaration-specifiers, which means that it needs to be a
// retainable object type.
if (numPointers == 1) {
// If it's not a retainable object type, the rule doesn't apply.
if (!declSpecType->isObjCRetainableType()) return;
// If it already has lifetime, don't do anything.
if (declSpecType.getObjCLifetime()) return;
// Otherwise, modify the type in-place.
Qualifiers qs;
if (declSpecType->isObjCARCImplicitlyUnretainedType())
qs.addObjCLifetime(Qualifiers::OCL_ExplicitNone);
else
qs.addObjCLifetime(Qualifiers::OCL_Autoreleasing);
declSpecType = S.Context.getQualifiedType(declSpecType, qs);
// If we have *two* pointers, then we want to throw the qualifier on
// the outermost pointer.
} else if (numPointers == 2) {
// If we don't have a block pointer, we need to check whether the
// declaration-specifiers gave us something that will turn into a
// retainable object pointer after we slap the first pointer on it.
if (!isBlockPointer && !declSpecType->isObjCObjectType())
return;
// Look for an explicit lifetime attribute there.
DeclaratorChunk &chunk = declarator.getTypeObject(outermostPointerIndex);
if (chunk.Kind != DeclaratorChunk::Pointer &&
chunk.Kind != DeclaratorChunk::BlockPointer)
return;
for (const ParsedAttr &AL : chunk.getAttrs())
if (AL.getKind() == ParsedAttr::AT_ObjCOwnership)
return;
transferARCOwnershipToDeclaratorChunk(state, Qualifiers::OCL_Autoreleasing,
outermostPointerIndex);
// Any other number of pointers/references does not trigger the rule.
} else return;
// TODO: mark whether we did this inference?
}
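// Illustrative sketch (hypothetical Objective-C code, not part of this source
// file): under ARC, an out-parameter declared as
//
//   - (BOOL)save:(NSError **)error;
//
// has the ownership of its inner pointer inferred, yielding the parameter type
// 'NSError *__autoreleasing *', unless an explicit ownership qualifier was
// already written on the outer pointer.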
void Sema::diagnoseIgnoredQualifiers(unsigned DiagID, unsigned Quals,
SourceLocation FallbackLoc,
SourceLocation ConstQualLoc,
SourceLocation VolatileQualLoc,
SourceLocation RestrictQualLoc,
SourceLocation AtomicQualLoc,
SourceLocation UnalignedQualLoc) {
if (!Quals)
return;
struct Qual {
const char *Name;
unsigned Mask;
SourceLocation Loc;
} const QualKinds[5] = {
{ "const", DeclSpec::TQ_const, ConstQualLoc },
{ "volatile", DeclSpec::TQ_volatile, VolatileQualLoc },
{ "restrict", DeclSpec::TQ_restrict, RestrictQualLoc },
{ "__unaligned", DeclSpec::TQ_unaligned, UnalignedQualLoc },
{ "_Atomic", DeclSpec::TQ_atomic, AtomicQualLoc }
};
SmallString<32> QualStr;
unsigned NumQuals = 0;
SourceLocation Loc;
FixItHint FixIts[5];
// Build a string naming the redundant qualifiers.
for (auto &E : QualKinds) {
if (Quals & E.Mask) {
if (!QualStr.empty()) QualStr += ' ';
QualStr += E.Name;
// If we have a location for the qualifier, offer a fixit.
SourceLocation QualLoc = E.Loc;
if (QualLoc.isValid()) {
FixIts[NumQuals] = FixItHint::CreateRemoval(QualLoc);
if (Loc.isInvalid() ||
getSourceManager().isBeforeInTranslationUnit(QualLoc, Loc))
Loc = QualLoc;
}
++NumQuals;
}
}
Diag(Loc.isInvalid() ? FallbackLoc : Loc, DiagID)
<< QualStr << NumQuals << FixIts[0] << FixIts[1] << FixIts[2] << FixIts[3];
}
// Diagnose pointless type qualifiers on the return type of a function.
static void diagnoseRedundantReturnTypeQualifiers(Sema &S, QualType RetTy,
Declarator &D,
unsigned FunctionChunkIndex) {
if (D.getTypeObject(FunctionChunkIndex).Fun.hasTrailingReturnType()) {
// FIXME: TypeSourceInfo doesn't preserve location information for
// qualifiers.
S.diagnoseIgnoredQualifiers(diag::warn_qual_return_type,
RetTy.getLocalCVRQualifiers(),
D.getIdentifierLoc());
return;
}
for (unsigned OuterChunkIndex = FunctionChunkIndex + 1,
End = D.getNumTypeObjects();
OuterChunkIndex != End; ++OuterChunkIndex) {
DeclaratorChunk &OuterChunk = D.getTypeObject(OuterChunkIndex);
switch (OuterChunk.Kind) {
case DeclaratorChunk::Paren:
continue;
case DeclaratorChunk::Pointer: {
DeclaratorChunk::PointerTypeInfo &PTI = OuterChunk.Ptr;
S.diagnoseIgnoredQualifiers(
diag::warn_qual_return_type,
PTI.TypeQuals,
SourceLocation(),
SourceLocation::getFromRawEncoding(PTI.ConstQualLoc),
SourceLocation::getFromRawEncoding(PTI.VolatileQualLoc),
SourceLocation::getFromRawEncoding(PTI.RestrictQualLoc),
SourceLocation::getFromRawEncoding(PTI.AtomicQualLoc),
SourceLocation::getFromRawEncoding(PTI.UnalignedQualLoc));
return;
}
case DeclaratorChunk::Function:
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::Reference:
case DeclaratorChunk::Array:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pipe:
// FIXME: We can't currently provide an accurate source location and a
// fix-it hint for these.
unsigned AtomicQual = RetTy->isAtomicType() ? DeclSpec::TQ_atomic : 0;
S.diagnoseIgnoredQualifiers(diag::warn_qual_return_type,
RetTy.getCVRQualifiers() | AtomicQual,
D.getIdentifierLoc());
return;
}
llvm_unreachable("unknown declarator chunk kind");
}
// If the qualifiers come from a conversion function type, don't diagnose
// them -- they're not necessarily redundant, since such a conversion
// operator can be explicitly called as "x.operator const int()".
if (D.getName().getKind() == UnqualifiedIdKind::IK_ConversionFunctionId)
return;
// Just parens all the way out to the decl specifiers. Diagnose any qualifiers
// which are present there.
S.diagnoseIgnoredQualifiers(diag::warn_qual_return_type,
D.getDeclSpec().getTypeQualifiers(),
D.getIdentifierLoc(),
D.getDeclSpec().getConstSpecLoc(),
D.getDeclSpec().getVolatileSpecLoc(),
D.getDeclSpec().getRestrictSpecLoc(),
D.getDeclSpec().getAtomicSpecLoc(),
D.getDeclSpec().getUnalignedSpecLoc());
}
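// Illustrative sketch (hypothetical user code, not part of this source file):
//
//   const int f();        // warn_qual_return_type: 'const' has no effect
//   int *const g();       // likewise for the top-level 'const' on the pointer
//   struct X { operator const int(); }; // not diagnosed: may be called as
//                                       // x.operator const int()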
static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
TypeSourceInfo *&ReturnTypeInfo) {
Sema &SemaRef = state.getSema();
Declarator &D = state.getDeclarator();
QualType T;
ReturnTypeInfo = nullptr;
// The TagDecl owned by the DeclSpec.
TagDecl *OwnedTagDecl = nullptr;
switch (D.getName().getKind()) {
case UnqualifiedIdKind::IK_ImplicitSelfParam:
case UnqualifiedIdKind::IK_OperatorFunctionId:
case UnqualifiedIdKind::IK_Identifier:
case UnqualifiedIdKind::IK_LiteralOperatorId:
case UnqualifiedIdKind::IK_TemplateId:
T = ConvertDeclSpecToType(state);
if (!D.isInvalidType() && D.getDeclSpec().isTypeSpecOwned()) {
OwnedTagDecl = cast<TagDecl>(D.getDeclSpec().getRepAsDecl());
// Owned declaration is embedded in declarator.
OwnedTagDecl->setEmbeddedInDeclarator(true);
}
break;
case UnqualifiedIdKind::IK_ConstructorName:
case UnqualifiedIdKind::IK_ConstructorTemplateId:
case UnqualifiedIdKind::IK_DestructorName:
// Constructors and destructors don't have return types. Use
// "void" instead.
T = SemaRef.Context.VoidTy;
processTypeAttrs(state, T, TAL_DeclSpec,
D.getMutableDeclSpec().getAttributes());
break;
case UnqualifiedIdKind::IK_DeductionGuideName:
// Deduction guides have a trailing return type and no type in their
// decl-specifier sequence. Use a placeholder return type for now.
T = SemaRef.Context.DependentTy;
break;
case UnqualifiedIdKind::IK_ConversionFunctionId:
// The result type of a conversion function is the type that it
// converts to.
T = SemaRef.GetTypeFromParser(D.getName().ConversionFunctionId,
&ReturnTypeInfo);
break;
}
if (!D.getAttributes().empty())
distributeTypeAttrsFromDeclarator(state, T);
// C++11 [dcl.spec.auto]p5: reject 'auto' if it is not in an allowed context.
if (DeducedType *Deduced = T->getContainedDeducedType()) {
AutoType *Auto = dyn_cast<AutoType>(Deduced);
int Error = -1;
// Is this an 'auto' or 'decltype(auto)' type (as opposed to __auto_type or
// class template argument deduction)?
bool IsCXXAutoType =
(Auto && Auto->getKeyword() != AutoTypeKeyword::GNUAutoType);
bool IsDeducedReturnType = false;
switch (D.getContext()) {
case DeclaratorContext::LambdaExprContext:
// Declared return type of a lambda-declarator is implicit and is always
// 'auto'.
break;
case DeclaratorContext::ObjCParameterContext:
case DeclaratorContext::ObjCResultContext:
case DeclaratorContext::PrototypeContext:
Error = 0;
break;
case DeclaratorContext::LambdaExprParameterContext:
// In C++14, generic lambdas allow 'auto' in their parameters.
if (!SemaRef.getLangOpts().CPlusPlus14 ||
!Auto || Auto->getKeyword() != AutoTypeKeyword::Auto)
Error = 16;
else {
// If auto is mentioned in a lambda parameter context, convert it to a
// template parameter type.
sema::LambdaScopeInfo *LSI = SemaRef.getCurLambda();
assert(LSI && "No LambdaScopeInfo on the stack!");
const unsigned TemplateParameterDepth = LSI->AutoTemplateParameterDepth;
const unsigned AutoParameterPosition = LSI->TemplateParams.size();
const bool IsParameterPack = D.hasEllipsis();
// Create the TemplateTypeParmDecl here to retrieve the corresponding
// template parameter type. Template parameters are temporarily added
// to the TU until the associated TemplateDecl is created.
TemplateTypeParmDecl *CorrespondingTemplateParam =
TemplateTypeParmDecl::Create(
SemaRef.Context, SemaRef.Context.getTranslationUnitDecl(),
/*KeyLoc*/ SourceLocation(), /*NameLoc*/ D.getBeginLoc(),
TemplateParameterDepth, AutoParameterPosition,
/*Identifier*/ nullptr, false, IsParameterPack);
CorrespondingTemplateParam->setImplicit();
LSI->TemplateParams.push_back(CorrespondingTemplateParam);
// Replace the 'auto' in the function parameter with this invented
// template type parameter.
// FIXME: Retain some type sugar to indicate that this was written
// as 'auto'.
T = state.ReplaceAutoType(
T, QualType(CorrespondingTemplateParam->getTypeForDecl(), 0));
}
break;
case DeclaratorContext::MemberContext: {
if (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_static ||
D.isFunctionDeclarator())
break;
bool Cxx = SemaRef.getLangOpts().CPlusPlus;
switch (cast<TagDecl>(SemaRef.CurContext)->getTagKind()) {
case TTK_Enum: llvm_unreachable("unhandled tag kind");
case TTK_Struct: Error = Cxx ? 1 : 2; /* Struct member */ break;
case TTK_Union: Error = Cxx ? 3 : 4; /* Union member */ break;
case TTK_Class: Error = 5; /* Class member */ break;
case TTK_Interface: Error = 6; /* Interface member */ break;
}
if (D.getDeclSpec().isFriendSpecified())
Error = 20; // Friend type
break;
}
case DeclaratorContext::CXXCatchContext:
case DeclaratorContext::ObjCCatchContext:
Error = 7; // Exception declaration
break;
case DeclaratorContext::TemplateParamContext:
if (isa<DeducedTemplateSpecializationType>(Deduced))
Error = 19; // Template parameter
else if (!SemaRef.getLangOpts().CPlusPlus17)
Error = 8; // Template parameter (until C++17)
break;
case DeclaratorContext::BlockLiteralContext:
Error = 9; // Block literal
break;
case DeclaratorContext::TemplateArgContext:
// Within a template argument list, a deduced template specialization
// type will be reinterpreted as a template template argument.
if (isa<DeducedTemplateSpecializationType>(Deduced) &&
!D.getNumTypeObjects() &&
D.getDeclSpec().getParsedSpecifiers() == DeclSpec::PQ_TypeSpecifier)
break;
LLVM_FALLTHROUGH;
case DeclaratorContext::TemplateTypeArgContext:
Error = 10; // Template type argument
break;
case DeclaratorContext::AliasDeclContext:
case DeclaratorContext::AliasTemplateContext:
Error = 12; // Type alias
break;
case DeclaratorContext::TrailingReturnContext:
case DeclaratorContext::TrailingReturnVarContext:
if (!SemaRef.getLangOpts().CPlusPlus14 || !IsCXXAutoType)
Error = 13; // Function return type
IsDeducedReturnType = true;
break;
case DeclaratorContext::ConversionIdContext:
if (!SemaRef.getLangOpts().CPlusPlus14 || !IsCXXAutoType)
Error = 14; // conversion-type-id
IsDeducedReturnType = true;
break;
case DeclaratorContext::FunctionalCastContext:
if (isa<DeducedTemplateSpecializationType>(Deduced))
break;
LLVM_FALLTHROUGH;
case DeclaratorContext::TypeNameContext:
Error = 15; // Generic
break;
case DeclaratorContext::FileContext:
case DeclaratorContext::BlockContext:
case DeclaratorContext::ForContext:
case DeclaratorContext::InitStmtContext:
case DeclaratorContext::ConditionContext:
// FIXME: P0091R3 (erroneously) does not permit class template argument
// deduction in conditions, for-init-statements, and other declarations
// that are not simple-declarations.
break;
case DeclaratorContext::CXXNewContext:
// FIXME: P0091R3 does not permit class template argument deduction here,
// but we follow GCC and allow it anyway.
if (!IsCXXAutoType && !isa<DeducedTemplateSpecializationType>(Deduced))
Error = 17; // 'new' type
break;
case DeclaratorContext::KNRTypeListContext:
Error = 18; // K&R function parameter
break;
}
if (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_typedef)
Error = 11;
// In Objective-C it is an error to use 'auto' on a function declarator
// (and everywhere for '__auto_type').
if (D.isFunctionDeclarator() &&
(!SemaRef.getLangOpts().CPlusPlus11 || !IsCXXAutoType))
Error = 13;
bool HaveTrailing = false;
// C++11 [dcl.spec.auto]p2: 'auto' is always fine if the declarator
// contains a trailing return type. That is only legal at the outermost
// level. Check all declarator chunks (outermost first) anyway, to give
// better diagnostics.
// We don't support '__auto_type' with trailing return types.
// FIXME: Should we only do this for 'auto' and not 'decltype(auto)'?
if (SemaRef.getLangOpts().CPlusPlus11 && IsCXXAutoType &&
D.hasTrailingReturnType()) {
HaveTrailing = true;
Error = -1;
}
SourceRange AutoRange = D.getDeclSpec().getTypeSpecTypeLoc();
if (D.getName().getKind() == UnqualifiedIdKind::IK_ConversionFunctionId)
AutoRange = D.getName().getSourceRange();
if (Error != -1) {
unsigned Kind;
if (Auto) {
switch (Auto->getKeyword()) {
case AutoTypeKeyword::Auto: Kind = 0; break;
case AutoTypeKeyword::DecltypeAuto: Kind = 1; break;
case AutoTypeKeyword::GNUAutoType: Kind = 2; break;
}
} else {
assert(isa<DeducedTemplateSpecializationType>(Deduced) &&
"unknown auto type");
Kind = 3;
}
auto *DTST = dyn_cast<DeducedTemplateSpecializationType>(Deduced);
TemplateName TN = DTST ? DTST->getTemplateName() : TemplateName();
SemaRef.Diag(AutoRange.getBegin(), diag::err_auto_not_allowed)
<< Kind << Error << (int)SemaRef.getTemplateNameKindForDiagnostics(TN)
<< QualType(Deduced, 0) << AutoRange;
if (auto *TD = TN.getAsTemplateDecl())
SemaRef.Diag(TD->getLocation(), diag::note_template_decl_here);
T = SemaRef.Context.IntTy;
D.setInvalidType(true);
} else if (!HaveTrailing &&
D.getContext() != DeclaratorContext::LambdaExprContext) {
// If there was a trailing return type, we already got
// warn_cxx98_compat_trailing_return_type in the parser.
SemaRef.Diag(AutoRange.getBegin(),
D.getContext() ==
DeclaratorContext::LambdaExprParameterContext
? diag::warn_cxx11_compat_generic_lambda
: IsDeducedReturnType
? diag::warn_cxx11_compat_deduced_return_type
: diag::warn_cxx98_compat_auto_type_specifier)
<< AutoRange;
}
}
if (SemaRef.getLangOpts().CPlusPlus &&
OwnedTagDecl && OwnedTagDecl->isCompleteDefinition()) {
// Check the contexts where C++ forbids the declaration of a new class
// or enumeration in a type-specifier-seq.
unsigned DiagID = 0;
switch (D.getContext()) {
case DeclaratorContext::TrailingReturnContext:
case DeclaratorContext::TrailingReturnVarContext:
// Class and enumeration definitions are syntactically not allowed in
// trailing return types.
llvm_unreachable("parser should not have allowed this");
break;
case DeclaratorContext::FileContext:
case DeclaratorContext::MemberContext:
case DeclaratorContext::BlockContext:
case DeclaratorContext::ForContext:
case DeclaratorContext::InitStmtContext:
case DeclaratorContext::BlockLiteralContext:
case DeclaratorContext::LambdaExprContext:
// C++11 [dcl.type]p3:
// A type-specifier-seq shall not define a class or enumeration unless
// it appears in the type-id of an alias-declaration (7.1.3) that is not
// the declaration of a template-declaration.
case DeclaratorContext::AliasDeclContext:
break;
case DeclaratorContext::AliasTemplateContext:
DiagID = diag::err_type_defined_in_alias_template;
break;
case DeclaratorContext::TypeNameContext:
case DeclaratorContext::FunctionalCastContext:
case DeclaratorContext::ConversionIdContext:
case DeclaratorContext::TemplateParamContext:
case DeclaratorContext::CXXNewContext:
case DeclaratorContext::CXXCatchContext:
case DeclaratorContext::ObjCCatchContext:
case DeclaratorContext::TemplateArgContext:
case DeclaratorContext::TemplateTypeArgContext:
DiagID = diag::err_type_defined_in_type_specifier;
break;
case DeclaratorContext::PrototypeContext:
case DeclaratorContext::LambdaExprParameterContext:
case DeclaratorContext::ObjCParameterContext:
case DeclaratorContext::ObjCResultContext:
case DeclaratorContext::KNRTypeListContext:
// C++ [dcl.fct]p6:
// Types shall not be defined in return or parameter types.
DiagID = diag::err_type_defined_in_param_type;
break;
case DeclaratorContext::ConditionContext:
// C++ 6.4p2:
// The type-specifier-seq shall not contain typedef and shall not declare
// a new class or enumeration.
DiagID = diag::err_type_defined_in_condition;
break;
}
if (DiagID != 0) {
SemaRef.Diag(OwnedTagDecl->getLocation(), DiagID)
<< SemaRef.Context.getTypeDeclType(OwnedTagDecl);
D.setInvalidType(true);
}
}
assert(!T.isNull() && "This function should not return a null type");
return T;
}
/// Produce an appropriate diagnostic for an ambiguity between a function
/// declarator and a C++ direct-initializer.
static void warnAboutAmbiguousFunction(Sema &S, Declarator &D,
DeclaratorChunk &DeclType, QualType RT) {
const DeclaratorChunk::FunctionTypeInfo &FTI = DeclType.Fun;
assert(FTI.isAmbiguous && "no direct-initializer / function ambiguity");
// If the return type is void there is no ambiguity.
if (RT->isVoidType())
return;
// An initializer for a non-class type can have at most one argument.
if (!RT->isRecordType() && FTI.NumParams > 1)
return;
// An initializer for a reference must have exactly one argument.
if (RT->isReferenceType() && FTI.NumParams != 1)
return;
// Only warn if this declarator is declaring a function at block scope, and
// doesn't have a storage class (such as 'extern') specified.
if (!D.isFunctionDeclarator() ||
D.getFunctionDefinitionKind() != FDK_Declaration ||
!S.CurContext->isFunctionOrMethod() ||
D.getDeclSpec().getStorageClassSpec()
!= DeclSpec::SCS_unspecified)
return;
// Inside a condition, a direct initializer is not permitted. We allow one to
// be parsed in order to give better diagnostics in condition parsing.
if (D.getContext() == DeclaratorContext::ConditionContext)
return;
SourceRange ParenRange(DeclType.Loc, DeclType.EndLoc);
S.Diag(DeclType.Loc,
FTI.NumParams ? diag::warn_parens_disambiguated_as_function_declaration
: diag::warn_empty_parens_are_function_decl)
<< ParenRange;
// If the declaration looks like:
// T var1,
// f();
// and name lookup finds a function named 'f', then the ',' was
// probably intended to be a ';'.
if (!D.isFirstDeclarator() && D.getIdentifier()) {
FullSourceLoc Comma(D.getCommaLoc(), S.SourceMgr);
FullSourceLoc Name(D.getIdentifierLoc(), S.SourceMgr);
if (Comma.getFileID() != Name.getFileID() ||
Comma.getSpellingLineNumber() != Name.getSpellingLineNumber()) {
LookupResult Result(S, D.getIdentifier(), SourceLocation(),
Sema::LookupOrdinaryName);
if (S.LookupName(Result, S.getCurScope()))
S.Diag(D.getCommaLoc(), diag::note_empty_parens_function_call)
<< FixItHint::CreateReplacement(D.getCommaLoc(), ";")
<< D.getIdentifier();
Result.suppressDiagnostics();
}
}
if (FTI.NumParams > 0) {
// For a declaration with parameters, e.g. "T var(T());", suggest adding
// parens around the first parameter to turn the declaration into a
// variable declaration.
SourceRange Range = FTI.Params[0].Param->getSourceRange();
SourceLocation B = Range.getBegin();
SourceLocation E = S.getLocForEndOfToken(Range.getEnd());
// FIXME: Maybe we should suggest adding braces instead of parens
// in C++11 for classes that don't have an initializer_list constructor.
S.Diag(B, diag::note_additional_parens_for_variable_declaration)
<< FixItHint::CreateInsertion(B, "(")
<< FixItHint::CreateInsertion(E, ")");
} else {
// For a declaration without parameters, e.g. "T var();", suggest replacing
// the parens with an initializer to turn the declaration into a variable
// declaration.
const CXXRecordDecl *RD = RT->getAsCXXRecordDecl();
// Empty parens mean value-initialization, and no parens mean
// default initialization. These are equivalent if the default
// constructor is user-provided or if zero-initialization is a
// no-op.
if (RD && RD->hasDefinition() &&
(RD->isEmpty() || RD->hasUserProvidedDefaultConstructor()))
S.Diag(DeclType.Loc, diag::note_empty_parens_default_ctor)
<< FixItHint::CreateRemoval(ParenRange);
else {
std::string Init =
S.getFixItZeroInitializerForType(RT, ParenRange.getBegin());
if (Init.empty() && S.LangOpts.CPlusPlus11)
Init = "{}";
if (!Init.empty())
S.Diag(DeclType.Loc, diag::note_empty_parens_zero_initialize)
<< FixItHint::CreateReplacement(ParenRange, Init);
}
}
}
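// Illustrative sketch (hypothetical C++ code, not part of this source file):
// the "most vexing parse" cases targeted above, where T is some class type.
//
//   T obj1(T());  // declares a function; the note suggests 'T obj1((T()));'
//   T obj2();     // declares a function; the note suggests removing the parens
//                 // or using an initializer such as 'T obj2{};' in C++11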
/// Produce an appropriate diagnostic for a declarator with top-level
/// parentheses.
static void warnAboutRedundantParens(Sema &S, Declarator &D, QualType T) {
DeclaratorChunk &Paren = D.getTypeObject(D.getNumTypeObjects() - 1);
assert(Paren.Kind == DeclaratorChunk::Paren &&
"do not have redundant top-level parentheses");
// This is a syntactic check; we're not interested in cases that arise
// during template instantiation.
if (S.inTemplateInstantiation())
return;
// Check whether this could be intended to be a construction of a temporary
// object in C++ via a function-style cast.
bool CouldBeTemporaryObject =
S.getLangOpts().CPlusPlus && D.isExpressionContext() &&
!D.isInvalidType() && D.getIdentifier() &&
D.getDeclSpec().getParsedSpecifiers() == DeclSpec::PQ_TypeSpecifier &&
(T->isRecordType() || T->isDependentType()) &&
D.getDeclSpec().getTypeQualifiers() == 0 && D.isFirstDeclarator();
bool StartsWithDeclaratorId = true;
for (auto &C : D.type_objects()) {
switch (C.Kind) {
case DeclaratorChunk::Paren:
if (&C == &Paren)
continue;
LLVM_FALLTHROUGH;
case DeclaratorChunk::Pointer:
StartsWithDeclaratorId = false;
continue;
case DeclaratorChunk::Array:
if (!C.Arr.NumElts)
CouldBeTemporaryObject = false;
continue;
case DeclaratorChunk::Reference:
// FIXME: Suppress the warning here if there is no initializer; we're
// going to give an error anyway.
// We assume that something like 'T (&x) = y;' is highly likely to not
// be intended to be a temporary object.
CouldBeTemporaryObject = false;
StartsWithDeclaratorId = false;
continue;
case DeclaratorChunk::Function:
// In a new-type-id, function chunks require parentheses.
if (D.getContext() == DeclaratorContext::CXXNewContext)
return;
// FIXME: "A(f())" deserves a vexing-parse warning, not just a
// redundant-parens warning, but we don't know whether the function
// chunk was syntactically valid as an expression here.
CouldBeTemporaryObject = false;
continue;
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pipe:
// These cannot appear in expressions.
CouldBeTemporaryObject = false;
StartsWithDeclaratorId = false;
continue;
}
}
// FIXME: If there is an initializer, assume that this is not intended to be
// a construction of a temporary object.
// Check whether the name has already been declared; if not, this is not a
// function-style cast.
if (CouldBeTemporaryObject) {
LookupResult Result(S, D.getIdentifier(), SourceLocation(),
Sema::LookupOrdinaryName);
if (!S.LookupName(Result, S.getCurScope()))
CouldBeTemporaryObject = false;
Result.suppressDiagnostics();
}
SourceRange ParenRange(Paren.Loc, Paren.EndLoc);
if (!CouldBeTemporaryObject) {
// If we have A (::B), the parentheses affect the meaning of the program.
// Suppress the warning in that case. Don't bother looking at the DeclSpec
// here: even (e.g.) "int ::x" is visually ambiguous even though it's
// formally unambiguous.
if (StartsWithDeclaratorId && D.getCXXScopeSpec().isValid()) {
for (NestedNameSpecifier *NNS = D.getCXXScopeSpec().getScopeRep(); NNS;
NNS = NNS->getPrefix()) {
if (NNS->getKind() == NestedNameSpecifier::Global)
return;
}
}
S.Diag(Paren.Loc, diag::warn_redundant_parens_around_declarator)
<< ParenRange << FixItHint::CreateRemoval(Paren.Loc)
<< FixItHint::CreateRemoval(Paren.EndLoc);
return;
}
S.Diag(Paren.Loc, diag::warn_parens_disambiguated_as_variable_declaration)
<< ParenRange << D.getIdentifier();
auto *RD = T->getAsCXXRecordDecl();
if (!RD || !RD->hasDefinition() || RD->hasNonTrivialDestructor())
S.Diag(Paren.Loc, diag::note_raii_guard_add_name)
<< FixItHint::CreateInsertion(Paren.Loc, " varname") << T
<< D.getIdentifier();
// FIXME: A cast to void is probably a better suggestion in cases where it's
// valid (when there is no initializer and we're not in a condition).
S.Diag(D.getBeginLoc(), diag::note_function_style_cast_add_parentheses)
<< FixItHint::CreateInsertion(D.getBeginLoc(), "(")
<< FixItHint::CreateInsertion(S.getLocForEndOfToken(D.getEndLoc()), ")");
S.Diag(Paren.Loc, diag::note_remove_parens_for_variable_declaration)
<< FixItHint::CreateRemoval(Paren.Loc)
<< FixItHint::CreateRemoval(Paren.EndLoc);
}
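// Illustrative sketch (hypothetical C++ code, not part of this source file):
//
//   int (x) = 0;   // warn_redundant_parens_around_declarator
//
//   void f(std::mutex &m) {
//     std::unique_lock<std::mutex> (m);  // declares a variable named 'm'; the
//                                        // mutex is never locked, so this is
//                                        // diagnosed with notes suggesting a
//                                        // variable name or extra parentheses
//   }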
/// Helper for figuring out the default CC for a function declarator type. If
/// this is the outermost chunk, then we can determine the CC from the
/// declarator context. If not, then this could be either a member function
/// type or normal function type.
static CallingConv getCCForDeclaratorChunk(
Sema &S, Declarator &D, const ParsedAttributesView &AttrList,
const DeclaratorChunk::FunctionTypeInfo &FTI, unsigned ChunkIndex) {
assert(D.getTypeObject(ChunkIndex).Kind == DeclaratorChunk::Function);
// Check for an explicit CC attribute.
for (const ParsedAttr &AL : AttrList) {
switch (AL.getKind()) {
CALLING_CONV_ATTRS_CASELIST : {
// Ignore attributes that don't validate or can't apply to the
// function type. We'll diagnose the failure to apply them in
// handleFunctionTypeAttr.
CallingConv CC;
if (!S.CheckCallingConvAttr(AL, CC) &&
(!FTI.isVariadic || supportsVariadicCall(CC))) {
return CC;
}
break;
}
default:
break;
}
}
bool IsCXXInstanceMethod = false;
if (S.getLangOpts().CPlusPlus) {
// Look inwards through parentheses to see if this chunk will form a
// member pointer type or if we're the declarator. Any type attributes
// between here and there will override the CC we choose here.
unsigned I = ChunkIndex;
bool FoundNonParen = false;
while (I && !FoundNonParen) {
--I;
if (D.getTypeObject(I).Kind != DeclaratorChunk::Paren)
FoundNonParen = true;
}
if (FoundNonParen) {
// If we're not the declarator, we're a regular function type unless we're
// in a member pointer.
IsCXXInstanceMethod =
D.getTypeObject(I).Kind == DeclaratorChunk::MemberPointer;
} else if (D.getContext() == DeclaratorContext::LambdaExprContext) {
// This can only be a call operator for a lambda, which is an instance
// method.
IsCXXInstanceMethod = true;
} else {
// We're the innermost decl chunk, so must be a function declarator.
assert(D.isFunctionDeclarator());
// If we're inside a record, we're declaring a method, but it could be
// explicitly or implicitly static.
IsCXXInstanceMethod =
D.isFirstDeclarationOfMember() &&
D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_typedef &&
!D.isStaticMember();
}
}
CallingConv CC = S.Context.getDefaultCallingConvention(FTI.isVariadic,
IsCXXInstanceMethod);
// Attribute AT_OpenCLKernel affects the calling convention for SPIR
// and AMDGPU targets, hence it cannot be treated as a calling
// convention attribute. This is the simplest place to infer
// calling convention for OpenCL kernels.
if (S.getLangOpts().OpenCL) {
for (const ParsedAttr &AL : D.getDeclSpec().getAttributes()) {
if (AL.getKind() == ParsedAttr::AT_OpenCLKernel) {
CC = CC_OpenCLKernel;
break;
}
}
}
return CC;
}
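// Illustrative note (assumptions about common targets, not statements from
// this source file): on 32-bit Windows, instance methods default to thiscall
// while free functions default to cdecl, which is why this helper must work
// out whether the chunk will become a C++ instance method. Under OpenCL,
//
//   kernel void k(global int *p);
//
// picks up CC_OpenCLKernel here when no explicit CC attribute applies.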
namespace {
/// A simple notion of pointer kinds, which matches up with the various
/// pointer declarators.
enum class SimplePointerKind {
Pointer,
BlockPointer,
MemberPointer,
Array,
};
} // end anonymous namespace
IdentifierInfo *Sema::getNullabilityKeyword(NullabilityKind nullability) {
switch (nullability) {
case NullabilityKind::NonNull:
if (!Ident__Nonnull)
Ident__Nonnull = PP.getIdentifierInfo("_Nonnull");
return Ident__Nonnull;
case NullabilityKind::Nullable:
if (!Ident__Nullable)
Ident__Nullable = PP.getIdentifierInfo("_Nullable");
return Ident__Nullable;
case NullabilityKind::Unspecified:
if (!Ident__Null_unspecified)
Ident__Null_unspecified = PP.getIdentifierInfo("_Null_unspecified");
return Ident__Null_unspecified;
}
llvm_unreachable("Unknown nullability kind.");
}
/// Retrieve the identifier "NSError".
IdentifierInfo *Sema::getNSErrorIdent() {
if (!Ident_NSError)
Ident_NSError = PP.getIdentifierInfo("NSError");
return Ident_NSError;
}
/// Check whether there is a nullability attribute of any kind in the given
/// attribute list.
static bool hasNullabilityAttr(const ParsedAttributesView &attrs) {
for (const ParsedAttr &AL : attrs) {
if (AL.getKind() == ParsedAttr::AT_TypeNonNull ||
AL.getKind() == ParsedAttr::AT_TypeNullable ||
AL.getKind() == ParsedAttr::AT_TypeNullUnspecified)
return true;
}
return false;
}
namespace {
/// Describes the kind of a pointer a declarator describes.
enum class PointerDeclaratorKind {
// Not a pointer.
NonPointer,
// Single-level pointer.
SingleLevelPointer,
// Multi-level pointer (of any pointer kind).
MultiLevelPointer,
// CFFooRef*
MaybePointerToCFRef,
// CFErrorRef*
CFErrorRefPointer,
// NSError**
NSErrorPointerPointer,
};
/// Describes a declarator chunk wrapping a pointer that marks inference as
/// unexpected.
// These values must be kept in sync with diagnostics.
enum class PointerWrappingDeclaratorKind {
/// Pointer is top-level.
None = -1,
/// Pointer is an array element.
Array = 0,
/// Pointer is the referent type of a C++ reference.
Reference = 1
};
} // end anonymous namespace
/// Classify the given declarator, whose type specifier is \c type, based on
/// what kind of pointer it refers to.
///
/// This is used to determine the default nullability.
static PointerDeclaratorKind
classifyPointerDeclarator(Sema &S, QualType type, Declarator &declarator,
PointerWrappingDeclaratorKind &wrappingKind) {
unsigned numNormalPointers = 0;
// For any dependent type, we consider it a non-pointer.
if (type->isDependentType())
return PointerDeclaratorKind::NonPointer;
// Look through the declarator chunks to identify pointers.
for (unsigned i = 0, n = declarator.getNumTypeObjects(); i != n; ++i) {
DeclaratorChunk &chunk = declarator.getTypeObject(i);
switch (chunk.Kind) {
case DeclaratorChunk::Array:
if (numNormalPointers == 0)
wrappingKind = PointerWrappingDeclaratorKind::Array;
break;
case DeclaratorChunk::Function:
case DeclaratorChunk::Pipe:
break;
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::MemberPointer:
return numNormalPointers > 0 ? PointerDeclaratorKind::MultiLevelPointer
: PointerDeclaratorKind::SingleLevelPointer;
case DeclaratorChunk::Paren:
break;
case DeclaratorChunk::Reference:
if (numNormalPointers == 0)
wrappingKind = PointerWrappingDeclaratorKind::Reference;
break;
case DeclaratorChunk::Pointer:
++numNormalPointers;
if (numNormalPointers > 2)
return PointerDeclaratorKind::MultiLevelPointer;
break;
}
}
// Then, dig into the type specifier itself.
unsigned numTypeSpecifierPointers = 0;
do {
// Decompose normal pointers.
if (auto ptrType = type->getAs<PointerType>()) {
++numNormalPointers;
if (numNormalPointers > 2)
return PointerDeclaratorKind::MultiLevelPointer;
type = ptrType->getPointeeType();
++numTypeSpecifierPointers;
continue;
}
// Decompose block pointers.
if (type->getAs<BlockPointerType>()) {
return numNormalPointers > 0 ? PointerDeclaratorKind::MultiLevelPointer
: PointerDeclaratorKind::SingleLevelPointer;
}
// Decompose member pointers.
if (type->getAs<MemberPointerType>()) {
return numNormalPointers > 0 ? PointerDeclaratorKind::MultiLevelPointer
: PointerDeclaratorKind::SingleLevelPointer;
}
// Look at Objective-C object pointers.
if (auto objcObjectPtr = type->getAs<ObjCObjectPointerType>()) {
++numNormalPointers;
++numTypeSpecifierPointers;
// If this is NSError**, report that.
if (auto objcClassDecl = objcObjectPtr->getInterfaceDecl()) {
if (objcClassDecl->getIdentifier() == S.getNSErrorIdent() &&
numNormalPointers == 2 && numTypeSpecifierPointers < 2) {
return PointerDeclaratorKind::NSErrorPointerPointer;
}
}
break;
}
// Look at Objective-C class types.
if (auto objcClass = type->getAs<ObjCInterfaceType>()) {
if (objcClass->getInterface()->getIdentifier() == S.getNSErrorIdent()) {
if (numNormalPointers == 2 && numTypeSpecifierPointers < 2)
return PointerDeclaratorKind::NSErrorPointerPointer;
}
break;
}
// If at this point we haven't seen a pointer, we won't see one.
if (numNormalPointers == 0)
return PointerDeclaratorKind::NonPointer;
if (auto recordType = type->getAs<RecordType>()) {
RecordDecl *recordDecl = recordType->getDecl();
bool isCFError = false;
if (S.CFError) {
// If we already know about CFError, test it directly.
isCFError = (S.CFError == recordDecl);
} else {
// Check whether this is CFError, which we identify based on its bridge
// to NSError. CFErrorRef used to be declared with "objc_bridge" but is
// now declared with "objc_bridge_mutable", so look for either one of
// the two attributes.
if (recordDecl->getTagKind() == TTK_Struct && numNormalPointers > 0) {
IdentifierInfo *bridgedType = nullptr;
if (auto bridgeAttr = recordDecl->getAttr<ObjCBridgeAttr>())
bridgedType = bridgeAttr->getBridgedType();
else if (auto bridgeAttr =
recordDecl->getAttr<ObjCBridgeMutableAttr>())
bridgedType = bridgeAttr->getBridgedType();
if (bridgedType == S.getNSErrorIdent()) {
S.CFError = recordDecl;
isCFError = true;
}
}
}
// If this is CFErrorRef*, report it as such.
if (isCFError && numNormalPointers == 2 && numTypeSpecifierPointers < 2) {
return PointerDeclaratorKind::CFErrorRefPointer;
}
break;
}
break;
} while (true);
switch (numNormalPointers) {
case 0:
return PointerDeclaratorKind::NonPointer;
case 1:
return PointerDeclaratorKind::SingleLevelPointer;
case 2:
return PointerDeclaratorKind::MaybePointerToCFRef;
default:
return PointerDeclaratorKind::MultiLevelPointer;
}
}
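// Illustrative sketch (hypothetical parameter types, not part of this source
// file): expected classifications from classifyPointerDeclarator().
//
//   int *p              -> SingleLevelPointer
//   NSError **errOut    -> NSErrorPointerPointer
//   CFErrorRef *errOut  -> CFErrorRefPointer
//   CFStringRef *strOut -> MaybePointerToCFRef
//   int value           -> NonPointer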
static FileID getNullabilityCompletenessCheckFileID(Sema &S,
SourceLocation loc) {
// If we're anywhere in a function, method, or closure context, don't perform
// completeness checks.
for (DeclContext *ctx = S.CurContext; ctx; ctx = ctx->getParent()) {
if (ctx->isFunctionOrMethod())
return FileID();
if (ctx->isFileContext())
break;
}
// We only care about the expansion location.
loc = S.SourceMgr.getExpansionLoc(loc);
FileID file = S.SourceMgr.getFileID(loc);
if (file.isInvalid())
return FileID();
// Retrieve file information.
bool invalid = false;
const SrcMgr::SLocEntry &sloc = S.SourceMgr.getSLocEntry(file, &invalid);
if (invalid || !sloc.isFile())
return FileID();
// We don't want to perform completeness checks on the main file or in
// system headers.
const SrcMgr::FileInfo &fileInfo = sloc.getFile();
if (fileInfo.getIncludeLoc().isInvalid())
return FileID();
if (fileInfo.getFileCharacteristic() != SrcMgr::C_User &&
S.Diags.getSuppressSystemWarnings()) {
return FileID();
}
return file;
}
/// Creates a fix-it to insert a C-style nullability keyword at \p pointerLoc,
/// taking into account whitespace before and after.
static void fixItNullability(Sema &S, DiagnosticBuilder &Diag,
SourceLocation PointerLoc,
NullabilityKind Nullability) {
assert(PointerLoc.isValid());
if (PointerLoc.isMacroID())
return;
SourceLocation FixItLoc = S.getLocForEndOfToken(PointerLoc);
if (!FixItLoc.isValid() || FixItLoc == PointerLoc)
return;
const char *NextChar = S.SourceMgr.getCharacterData(FixItLoc);
if (!NextChar)
return;
SmallString<32> InsertionTextBuf{" "};
InsertionTextBuf += getNullabilitySpelling(Nullability);
InsertionTextBuf += " ";
StringRef InsertionText = InsertionTextBuf.str();
if (isWhitespace(*NextChar)) {
InsertionText = InsertionText.drop_back();
} else if (NextChar[-1] == '[') {
if (NextChar[0] == ']')
InsertionText = InsertionText.drop_back().drop_front();
else
InsertionText = InsertionText.drop_front();
} else if (!isIdentifierBody(NextChar[0], /*allow dollar*/true) &&
!isIdentifierBody(NextChar[-1], /*allow dollar*/true)) {
InsertionText = InsertionText.drop_back().drop_front();
}
Diag << FixItHint::CreateInsertion(FixItLoc, InsertionText);
}
static void emitNullabilityConsistencyWarning(Sema &S,
SimplePointerKind PointerKind,
SourceLocation PointerLoc,
SourceLocation PointerEndLoc) {
assert(PointerLoc.isValid());
if (PointerKind == SimplePointerKind::Array) {
S.Diag(PointerLoc, diag::warn_nullability_missing_array);
} else {
S.Diag(PointerLoc, diag::warn_nullability_missing)
<< static_cast<unsigned>(PointerKind);
}
auto FixItLoc = PointerEndLoc.isValid() ? PointerEndLoc : PointerLoc;
if (FixItLoc.isMacroID())
return;
auto addFixIt = [&](NullabilityKind Nullability) {
auto Diag = S.Diag(FixItLoc, diag::note_nullability_fix_it);
Diag << static_cast<unsigned>(Nullability);
Diag << static_cast<unsigned>(PointerKind);
fixItNullability(S, Diag, FixItLoc, Nullability);
};
addFixIt(NullabilityKind::Nullable);
addFixIt(NullabilityKind::NonNull);
}
/// Complains about missing nullability if the file containing \p pointerLoc
/// has other uses of nullability (either the keywords or the \c assume_nonnull
/// pragma).
///
/// If the file has \e not seen other uses of nullability, this particular
/// pointer is saved for possible later diagnosis. See recordNullabilitySeen().
static void
checkNullabilityConsistency(Sema &S, SimplePointerKind pointerKind,
SourceLocation pointerLoc,
SourceLocation pointerEndLoc = SourceLocation()) {
// Determine which file we're performing consistency checking for.
FileID file = getNullabilityCompletenessCheckFileID(S, pointerLoc);
if (file.isInvalid())
return;
// If we haven't seen any type nullability in this file, we won't warn now
// about anything.
FileNullability &fileNullability = S.NullabilityMap[file];
if (!fileNullability.SawTypeNullability) {
// If this is the first pointer declarator in the file, and the appropriate
// warning is on, record it in case we need to diagnose it retroactively.
diag::kind diagKind;
if (pointerKind == SimplePointerKind::Array)
diagKind = diag::warn_nullability_missing_array;
else
diagKind = diag::warn_nullability_missing;
if (fileNullability.PointerLoc.isInvalid() &&
!S.Context.getDiagnostics().isIgnored(diagKind, pointerLoc)) {
fileNullability.PointerLoc = pointerLoc;
fileNullability.PointerEndLoc = pointerEndLoc;
fileNullability.PointerKind = static_cast<unsigned>(pointerKind);
}
return;
}
// Complain about missing nullability.
emitNullabilityConsistencyWarning(S, pointerKind, pointerLoc, pointerEndLoc);
}
/// Marks that a nullability feature has been used in the file containing
/// \p loc.
///
/// If this file already had pointer types in it that were missing nullability,
/// the first such instance is retroactively diagnosed.
///
/// \sa checkNullabilityConsistency
static void recordNullabilitySeen(Sema &S, SourceLocation loc) {
FileID file = getNullabilityCompletenessCheckFileID(S, loc);
if (file.isInvalid())
return;
FileNullability &fileNullability = S.NullabilityMap[file];
if (fileNullability.SawTypeNullability)
return;
fileNullability.SawTypeNullability = true;
// If we haven't seen any type nullability before, now we have. Retroactively
// diagnose the first unannotated pointer, if there was one.
if (fileNullability.PointerLoc.isInvalid())
return;
auto kind = static_cast<SimplePointerKind>(fileNullability.PointerKind);
emitNullabilityConsistencyWarning(S, kind, fileNullability.PointerLoc,
fileNullability.PointerEndLoc);
}
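// Illustrative sketch (hypothetical header contents, not part of this source
// file): once any nullability annotation appears in an included (non-system)
// header, unannotated pointers in that header are diagnosed, including ones
// that were seen earlier.
//
//   void f(int *p);            // recorded; diagnosed retroactively below
//   void g(int *_Nonnull q);   // first nullability use in this header
//   void h(int *r);            // warn_nullability_missing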
/// Returns true if any of the declarator chunks before \p endIndex include a
/// level of indirection: array, pointer, reference, or pointer-to-member.
///
/// Because declarator chunks are stored in outer-to-inner order, testing
/// every chunk before \p endIndex is testing all chunks that embed the current
/// chunk as part of their type.
///
/// It is legal to pass the result of Declarator::getNumTypeObjects() as the
/// end index, in which case all chunks are tested.
static bool hasOuterPointerLikeChunk(const Declarator &D, unsigned endIndex) {
unsigned i = endIndex;
while (i != 0) {
// Walk outwards along the declarator chunks.
--i;
const DeclaratorChunk &DC = D.getTypeObject(i);
switch (DC.Kind) {
case DeclaratorChunk::Paren:
break;
case DeclaratorChunk::Array:
case DeclaratorChunk::Pointer:
case DeclaratorChunk::Reference:
case DeclaratorChunk::MemberPointer:
return true;
case DeclaratorChunk::Function:
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::Pipe:
// These are invalid anyway, so just ignore.
break;
}
}
return false;
}
static bool IsNoDerefableChunk(DeclaratorChunk Chunk) {
return (Chunk.Kind == DeclaratorChunk::Pointer ||
Chunk.Kind == DeclaratorChunk::Array);
}
template<typename AttrT>
static AttrT *createSimpleAttr(ASTContext &Ctx, ParsedAttr &Attr) {
Attr.setUsedAsTypeAttr();
return ::new (Ctx)
AttrT(Attr.getRange(), Ctx, Attr.getAttributeSpellingListIndex());
}
static Attr *createNullabilityAttr(ASTContext &Ctx, ParsedAttr &Attr,
NullabilityKind NK) {
switch (NK) {
case NullabilityKind::NonNull:
return createSimpleAttr<TypeNonNullAttr>(Ctx, Attr);
case NullabilityKind::Nullable:
return createSimpleAttr<TypeNullableAttr>(Ctx, Attr);
case NullabilityKind::Unspecified:
return createSimpleAttr<TypeNullUnspecifiedAttr>(Ctx, Attr);
}
llvm_unreachable("unknown NullabilityKind");
}
// Diagnose whether this is a case with multiple address spaces.
// Returns true if this is an invalid case.
// ISO/IEC TR 18037 S5.3 (amending C99 6.7.3): "No type shall be qualified
// by qualifiers for two or more different address spaces."
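// For example (illustrative): a type qualified with both address_space(1)
// and address_space(2) is rejected below with
// err_attribute_address_multiple_qualifiers, while repeating the same
// address space twice only produces the "identical qualifiers" warning.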
static bool DiagnoseMultipleAddrSpaceAttributes(Sema &S, LangAS ASOld,
LangAS ASNew,
SourceLocation AttrLoc) {
if (ASOld != LangAS::Default) {
if (ASOld != ASNew) {
S.Diag(AttrLoc, diag::err_attribute_address_multiple_qualifiers);
return true;
}
// Emit a warning if they are identical; it's likely unintended.
S.Diag(AttrLoc,
diag::warn_attribute_address_multiple_identical_qualifiers);
}
return false;
}
static TypeSourceInfo *
GetTypeSourceInfoForDeclarator(TypeProcessingState &State,
QualType T, TypeSourceInfo *ReturnTypeInfo);
static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
QualType declSpecType,
TypeSourceInfo *TInfo) {
// The TypeSourceInfo that this function returns will never contain a null
// type. If there is an error, this function fills in a dummy type as a
// fallback.
QualType T = declSpecType;
Declarator &D = state.getDeclarator();
Sema &S = state.getSema();
ASTContext &Context = S.Context;
const LangOptions &LangOpts = S.getLangOpts();
// The name we're declaring, if any.
DeclarationName Name;
if (D.getIdentifier())
Name = D.getIdentifier();
// Does this declaration declare a typedef-name?
bool IsTypedefName =
D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_typedef ||
D.getContext() == DeclaratorContext::AliasDeclContext ||
D.getContext() == DeclaratorContext::AliasTemplateContext;
// Does T refer to a function type with a cv-qualifier or a ref-qualifier?
bool IsQualifiedFunction = T->isFunctionProtoType() &&
(!T->castAs<FunctionProtoType>()->getMethodQuals().empty() ||
T->castAs<FunctionProtoType>()->getRefQualifier() != RQ_None);
// If T is 'decltype(auto)', the only declarators we can have are parens
// and at most one function declarator if this is a function declaration.
// If T is a deduced class template specialization type, we can have no
// declarator chunks at all.
if (auto *DT = T->getAs<DeducedType>()) {
const AutoType *AT = T->getAs<AutoType>();
bool IsClassTemplateDeduction = isa<DeducedTemplateSpecializationType>(DT);
if ((AT && AT->isDecltypeAuto()) || IsClassTemplateDeduction) {
for (unsigned I = 0, E = D.getNumTypeObjects(); I != E; ++I) {
unsigned Index = E - I - 1;
DeclaratorChunk &DeclChunk = D.getTypeObject(Index);
unsigned DiagId = IsClassTemplateDeduction
? diag::err_deduced_class_template_compound_type
: diag::err_decltype_auto_compound_type;
unsigned DiagKind = 0;
switch (DeclChunk.Kind) {
case DeclaratorChunk::Paren:
// FIXME: Rejecting this is a little silly.
if (IsClassTemplateDeduction) {
DiagKind = 4;
break;
}
continue;
case DeclaratorChunk::Function: {
if (IsClassTemplateDeduction) {
DiagKind = 3;
break;
}
unsigned FnIndex;
if (D.isFunctionDeclarationContext() &&
D.isFunctionDeclarator(FnIndex) && FnIndex == Index)
continue;
DiagId = diag::err_decltype_auto_function_declarator_not_declaration;
break;
}
case DeclaratorChunk::Pointer:
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::MemberPointer:
DiagKind = 0;
break;
case DeclaratorChunk::Reference:
DiagKind = 1;
break;
case DeclaratorChunk::Array:
DiagKind = 2;
break;
case DeclaratorChunk::Pipe:
break;
}
S.Diag(DeclChunk.Loc, DiagId) << DiagKind;
D.setInvalidType(true);
break;
}
}
}
// Determine whether we should infer _Nonnull on pointer types.
Optional<NullabilityKind> inferNullability;
bool inferNullabilityCS = false;
bool inferNullabilityInnerOnly = false;
bool inferNullabilityInnerOnlyComplete = false;
// Are we in an assume-nonnull region?
bool inAssumeNonNullRegion = false;
SourceLocation assumeNonNullLoc = S.PP.getPragmaAssumeNonNullLoc();
if (assumeNonNullLoc.isValid()) {
inAssumeNonNullRegion = true;
recordNullabilitySeen(S, assumeNonNullLoc);
}
// Whether to complain about missing nullability specifiers or not.
enum {
/// Never complain.
CAMN_No,
/// Complain on the inner pointers (but not the outermost
/// pointer).
CAMN_InnerPointers,
/// Complain about any pointers that don't have nullability
/// specified or inferred.
CAMN_Yes
} complainAboutMissingNullability = CAMN_No;
unsigned NumPointersRemaining = 0;
auto complainAboutInferringWithinChunk = PointerWrappingDeclaratorKind::None;
if (IsTypedefName) {
// For typedefs, we do not infer any nullability (the default),
// and we only complain about missing nullability specifiers on
// inner pointers.
complainAboutMissingNullability = CAMN_InnerPointers;
if (T->canHaveNullability(/*ResultIfUnknown*/false) &&
!T->getNullability(S.Context)) {
// Note that we allow but don't require nullability on dependent types.
++NumPointersRemaining;
}
for (unsigned i = 0, n = D.getNumTypeObjects(); i != n; ++i) {
DeclaratorChunk &chunk = D.getTypeObject(i);
switch (chunk.Kind) {
case DeclaratorChunk::Array:
case DeclaratorChunk::Function:
case DeclaratorChunk::Pipe:
break;
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::MemberPointer:
++NumPointersRemaining;
break;
case DeclaratorChunk::Paren:
case DeclaratorChunk::Reference:
continue;
case DeclaratorChunk::Pointer:
++NumPointersRemaining;
continue;
}
}
} else {
bool isFunctionOrMethod = false;
switch (auto context = state.getDeclarator().getContext()) {
case DeclaratorContext::ObjCParameterContext:
case DeclaratorContext::ObjCResultContext:
case DeclaratorContext::PrototypeContext:
case DeclaratorContext::TrailingReturnContext:
case DeclaratorContext::TrailingReturnVarContext:
isFunctionOrMethod = true;
LLVM_FALLTHROUGH;
case DeclaratorContext::MemberContext:
if (state.getDeclarator().isObjCIvar() && !isFunctionOrMethod) {
complainAboutMissingNullability = CAMN_No;
break;
}
// Weak properties are inferred to be nullable.
if (state.getDeclarator().isObjCWeakProperty() && inAssumeNonNullRegion) {
inferNullability = NullabilityKind::Nullable;
break;
}
LLVM_FALLTHROUGH;
case DeclaratorContext::FileContext:
case DeclaratorContext::KNRTypeListContext: {
complainAboutMissingNullability = CAMN_Yes;
// Nullability inference depends on the type and declarator.
auto wrappingKind = PointerWrappingDeclaratorKind::None;
switch (classifyPointerDeclarator(S, T, D, wrappingKind)) {
case PointerDeclaratorKind::NonPointer:
case PointerDeclaratorKind::MultiLevelPointer:
// Cannot infer nullability.
break;
case PointerDeclaratorKind::SingleLevelPointer:
// Infer _Nonnull if we are in an assumes-nonnull region.
if (inAssumeNonNullRegion) {
complainAboutInferringWithinChunk = wrappingKind;
inferNullability = NullabilityKind::NonNull;
inferNullabilityCS =
(context == DeclaratorContext::ObjCParameterContext ||
context == DeclaratorContext::ObjCResultContext);
}
break;
case PointerDeclaratorKind::CFErrorRefPointer:
case PointerDeclaratorKind::NSErrorPointerPointer:
// Within a function or method signature, infer _Nullable at both
// levels.
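// (E.g., per the nullability rules, an 'NSError **' parameter in an
// assume-nonnull region is treated as 'NSError * _Nullable * _Nullable'.)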
if (isFunctionOrMethod && inAssumeNonNullRegion)
inferNullability = NullabilityKind::Nullable;
break;
case PointerDeclaratorKind::MaybePointerToCFRef:
if (isFunctionOrMethod) {
// On pointer-to-pointer parameters marked cf_returns_retained or
// cf_returns_not_retained, if the outer pointer is explicit then
// infer the inner pointer as _Nullable.
auto hasCFReturnsAttr =
[](const ParsedAttributesView &AttrList) -> bool {
return AttrList.hasAttribute(ParsedAttr::AT_CFReturnsRetained) ||
AttrList.hasAttribute(ParsedAttr::AT_CFReturnsNotRetained);
};
if (const auto *InnermostChunk = D.getInnermostNonParenChunk()) {
if (hasCFReturnsAttr(D.getAttributes()) ||
hasCFReturnsAttr(InnermostChunk->getAttrs()) ||
hasCFReturnsAttr(D.getDeclSpec().getAttributes())) {
inferNullability = NullabilityKind::Nullable;
inferNullabilityInnerOnly = true;
}
}
}
break;
}
break;
}
case DeclaratorContext::ConversionIdContext:
complainAboutMissingNullability = CAMN_Yes;
break;
case DeclaratorContext::AliasDeclContext:
case DeclaratorContext::AliasTemplateContext:
case DeclaratorContext::BlockContext:
case DeclaratorContext::BlockLiteralContext:
case DeclaratorContext::ConditionContext:
case DeclaratorContext::CXXCatchContext:
case DeclaratorContext::CXXNewContext:
case DeclaratorContext::ForContext:
case DeclaratorContext::InitStmtContext:
case DeclaratorContext::LambdaExprContext:
case DeclaratorContext::LambdaExprParameterContext:
case DeclaratorContext::ObjCCatchContext:
case DeclaratorContext::TemplateParamContext:
case DeclaratorContext::TemplateArgContext:
case DeclaratorContext::TemplateTypeArgContext:
case DeclaratorContext::TypeNameContext:
case DeclaratorContext::FunctionalCastContext:
// Don't infer in these contexts.
break;
}
}
// Local function that returns true if its argument looks like a va_list.
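// (va_list is often a typedef for a pointer or array type, but it is not
// expected to carry nullability annotations, so it is exempted below.)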
auto isVaList = [&S](QualType T) -> bool {
auto *typedefTy = T->getAs<TypedefType>();
if (!typedefTy)
return false;
TypedefDecl *vaListTypedef = S.Context.getBuiltinVaListDecl();
do {
if (typedefTy->getDecl() == vaListTypedef)
return true;
if (auto *name = typedefTy->getDecl()->getIdentifier())
if (name->isStr("va_list"))
return true;
typedefTy = typedefTy->desugar()->getAs<TypedefType>();
} while (typedefTy);
return false;
};
// Local function that checks the nullability for a given pointer declarator.
// Returns true if _Nonnull was inferred.
auto inferPointerNullability =
[&](SimplePointerKind pointerKind, SourceLocation pointerLoc,
SourceLocation pointerEndLoc,
ParsedAttributesView &attrs, AttributePool &Pool) -> ParsedAttr * {
// We've seen a pointer.
if (NumPointersRemaining > 0)
--NumPointersRemaining;
// If a nullability attribute is present, there's nothing to do.
if (hasNullabilityAttr(attrs))
return nullptr;
// If we're supposed to infer nullability, do so now.
if (inferNullability && !inferNullabilityInnerOnlyComplete) {
ParsedAttr::Syntax syntax = inferNullabilityCS
? ParsedAttr::AS_ContextSensitiveKeyword
: ParsedAttr::AS_Keyword;
ParsedAttr *nullabilityAttr = Pool.create(
S.getNullabilityKeyword(*inferNullability), SourceRange(pointerLoc),
nullptr, SourceLocation(), nullptr, 0, syntax);
attrs.addAtEnd(nullabilityAttr);
if (inferNullabilityCS) {
state.getDeclarator().getMutableDeclSpec().getObjCQualifiers()
->setObjCDeclQualifier(ObjCDeclSpec::DQ_CSNullability);
}
if (pointerLoc.isValid() &&
complainAboutInferringWithinChunk !=
PointerWrappingDeclaratorKind::None) {
auto Diag =
S.Diag(pointerLoc, diag::warn_nullability_inferred_on_nested_type);
Diag << static_cast<int>(complainAboutInferringWithinChunk);
fixItNullability(S, Diag, pointerLoc, NullabilityKind::NonNull);
}
if (inferNullabilityInnerOnly)
inferNullabilityInnerOnlyComplete = true;
return nullabilityAttr;
}
// If we're supposed to complain about missing nullability, do so
// now if it's truly missing.
switch (complainAboutMissingNullability) {
case CAMN_No:
break;
case CAMN_InnerPointers:
if (NumPointersRemaining == 0)
break;
LLVM_FALLTHROUGH;
case CAMN_Yes:
checkNullabilityConsistency(S, pointerKind, pointerLoc, pointerEndLoc);
}
return nullptr;
};
// If the type itself could have nullability but does not, infer pointer
// nullability and perform consistency checking.
if (S.CodeSynthesisContexts.empty()) {
if (T->canHaveNullability(/*ResultIfUnknown*/false) &&
!T->getNullability(S.Context)) {
if (isVaList(T)) {
// Record that we've seen a pointer, but do nothing else.
if (NumPointersRemaining > 0)
--NumPointersRemaining;
} else {
SimplePointerKind pointerKind = SimplePointerKind::Pointer;
if (T->isBlockPointerType())
pointerKind = SimplePointerKind::BlockPointer;
else if (T->isMemberPointerType())
pointerKind = SimplePointerKind::MemberPointer;
if (auto *attr = inferPointerNullability(
pointerKind, D.getDeclSpec().getTypeSpecTypeLoc(),
D.getDeclSpec().getEndLoc(),
D.getMutableDeclSpec().getAttributes(),
D.getMutableDeclSpec().getAttributePool())) {
T = state.getAttributedType(
createNullabilityAttr(Context, *attr, *inferNullability), T, T);
}
}
}
if (complainAboutMissingNullability == CAMN_Yes &&
T->isArrayType() && !T->getNullability(S.Context) && !isVaList(T) &&
D.isPrototypeContext() &&
!hasOuterPointerLikeChunk(D, D.getNumTypeObjects())) {
checkNullabilityConsistency(S, SimplePointerKind::Array,
D.getDeclSpec().getTypeSpecTypeLoc());
}
}
bool ExpectNoDerefChunk =
state.getCurrentAttributes().hasAttribute(ParsedAttr::AT_NoDeref);
// Walk the DeclTypeInfo, building the recursive type as we go.
// DeclTypeInfos are ordered from the identifier out, which is the opposite
// of what we want :).
for (unsigned i = 0, e = D.getNumTypeObjects(); i != e; ++i) {
unsigned chunkIndex = e - i - 1;
state.setCurrentChunkIndex(chunkIndex);
DeclaratorChunk &DeclType = D.getTypeObject(chunkIndex);
IsQualifiedFunction &= DeclType.Kind == DeclaratorChunk::Paren;
switch (DeclType.Kind) {
case DeclaratorChunk::Paren:
if (i == 0)
warnAboutRedundantParens(S, D, T);
T = S.BuildParenType(T);
break;
case DeclaratorChunk::BlockPointer:
// If blocks are disabled, emit an error.
if (!LangOpts.Blocks)
S.Diag(DeclType.Loc, diag::err_blocks_disable) << LangOpts.OpenCL;
// Handle pointer nullability.
inferPointerNullability(SimplePointerKind::BlockPointer, DeclType.Loc,
DeclType.EndLoc, DeclType.getAttrs(),
state.getDeclarator().getAttributePool());
T = S.BuildBlockPointerType(T, D.getIdentifierLoc(), Name);
if (DeclType.Cls.TypeQuals || LangOpts.OpenCL) {
// OpenCL v2.0, s6.12.5 - Block variable declarations are implicitly
// qualified with const.
if (LangOpts.OpenCL)
DeclType.Cls.TypeQuals |= DeclSpec::TQ_const;
T = S.BuildQualifiedType(T, DeclType.Loc, DeclType.Cls.TypeQuals);
}
break;
case DeclaratorChunk::Pointer:
// Verify that we're not building a pointer to pointer to function with
// exception specification.
if (LangOpts.CPlusPlus && S.CheckDistantExceptionSpec(T)) {
S.Diag(D.getIdentifierLoc(), diag::err_distant_exception_spec);
D.setInvalidType(true);
// Build the type anyway.
}
// Handle pointer nullability
inferPointerNullability(SimplePointerKind::Pointer, DeclType.Loc,
DeclType.EndLoc, DeclType.getAttrs(),
state.getDeclarator().getAttributePool());
if (LangOpts.ObjC && T->getAs<ObjCObjectType>()) {
T = Context.getObjCObjectPointerType(T);
if (DeclType.Ptr.TypeQuals)
T = S.BuildQualifiedType(T, DeclType.Loc, DeclType.Ptr.TypeQuals);
break;
}
// OpenCL v2.0 s6.9b - Pointer to image/sampler cannot be used.
// OpenCL v2.0 s6.13.16.1 - Pointer to pipe cannot be used.
// OpenCL v2.0 s6.12.5 - Pointers to Blocks are not allowed.
if (LangOpts.OpenCL) {
if (T->isImageType() || T->isSamplerT() || T->isPipeType() ||
T->isBlockPointerType()) {
S.Diag(D.getIdentifierLoc(), diag::err_opencl_pointer_to_type) << T;
D.setInvalidType(true);
}
}
T = S.BuildPointerType(T, DeclType.Loc, Name);
if (DeclType.Ptr.TypeQuals)
T = S.BuildQualifiedType(T, DeclType.Loc, DeclType.Ptr.TypeQuals);
break;
case DeclaratorChunk::Reference: {
// Verify that we're not building a reference to pointer to function with
// exception specification.
if (LangOpts.CPlusPlus && S.CheckDistantExceptionSpec(T)) {
S.Diag(D.getIdentifierLoc(), diag::err_distant_exception_spec);
D.setInvalidType(true);
// Build the type anyway.
}
T = S.BuildReferenceType(T, DeclType.Ref.LValueRef, DeclType.Loc, Name);
if (DeclType.Ref.HasRestrict)
T = S.BuildQualifiedType(T, DeclType.Loc, Qualifiers::Restrict);
break;
}
case DeclaratorChunk::Array: {
// Verify that we're not building an array of pointers to function with
// exception specification.
if (LangOpts.CPlusPlus && S.CheckDistantExceptionSpec(T)) {
S.Diag(D.getIdentifierLoc(), diag::err_distant_exception_spec);
D.setInvalidType(true);
// Build the type anyway.
}
DeclaratorChunk::ArrayTypeInfo &ATI = DeclType.Arr;
Expr *ArraySize = static_cast<Expr*>(ATI.NumElts);
ArrayType::ArraySizeModifier ASM;
if (ATI.isStar)
ASM = ArrayType::Star;
else if (ATI.hasStatic)
ASM = ArrayType::Static;
else
ASM = ArrayType::Normal;
if (ASM == ArrayType::Star && !D.isPrototypeContext()) {
// FIXME: This check isn't quite right: it allows star in prototypes
// for function definitions, and disallows some edge cases detailed
// in http://gcc.gnu.org/ml/gcc-patches/2009-02/msg00133.html
S.Diag(DeclType.Loc, diag::err_array_star_outside_prototype);
ASM = ArrayType::Normal;
D.setInvalidType(true);
}
// C99 6.7.5.2p1: The optional type qualifiers and the keyword static
// shall appear only in a declaration of a function parameter with an
// array type, ...
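// For example (illustrative): 'void f(int a[static 10])' is well-formed,
// but 'int a[static 10];' at block or file scope is not and is diagnosed
// just below with err_array_static_outside_prototype.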
if (ASM == ArrayType::Static || ATI.TypeQuals) {
if (!(D.isPrototypeContext() ||
D.getContext() == DeclaratorContext::KNRTypeListContext)) {
S.Diag(DeclType.Loc, diag::err_array_static_outside_prototype) <<
(ASM == ArrayType::Static ? "'static'" : "type qualifier");
// Remove the 'static' and the type qualifiers.
if (ASM == ArrayType::Static)
ASM = ArrayType::Normal;
ATI.TypeQuals = 0;
D.setInvalidType(true);
}
// C99 6.7.5.2p1: ... and then only in the outermost array type
// derivation.
if (hasOuterPointerLikeChunk(D, chunkIndex)) {
S.Diag(DeclType.Loc, diag::err_array_static_not_outermost) <<
(ASM == ArrayType::Static ? "'static'" : "type qualifier");
if (ASM == ArrayType::Static)
ASM = ArrayType::Normal;
ATI.TypeQuals = 0;
D.setInvalidType(true);
}
}
const AutoType *AT = T->getContainedAutoType();
// Allow arrays of auto if we are a generic lambda parameter.
// i.e. [](auto (&array)[5]) { return array[0]; }; OK
if (AT &&
D.getContext() != DeclaratorContext::LambdaExprParameterContext) {
// We've already diagnosed this for decltype(auto).
if (!AT->isDecltypeAuto())
S.Diag(DeclType.Loc, diag::err_illegal_decl_array_of_auto)
<< getPrintableNameForEntity(Name) << T;
T = QualType();
break;
}
// Array parameters can be marked nullable as well, although it's not
// necessary if they're marked 'static'.
if (complainAboutMissingNullability == CAMN_Yes &&
!hasNullabilityAttr(DeclType.getAttrs()) &&
ASM != ArrayType::Static &&
D.isPrototypeContext() &&
!hasOuterPointerLikeChunk(D, chunkIndex)) {
checkNullabilityConsistency(S, SimplePointerKind::Array, DeclType.Loc);
}
T = S.BuildArrayType(T, ASM, ArraySize, ATI.TypeQuals,
SourceRange(DeclType.Loc, DeclType.EndLoc), Name);
break;
}
case DeclaratorChunk::Function: {
// If the function declarator has a prototype (i.e. it is not () and
// does not have a K&R-style identifier list), then the arguments are part
// of the type, otherwise the argument list is ().
DeclaratorChunk::FunctionTypeInfo &FTI = DeclType.Fun;
IsQualifiedFunction =
FTI.hasMethodTypeQualifiers() || FTI.hasRefQualifier();
// Check for auto functions and trailing return type and adjust the
// return type accordingly.
if (!D.isInvalidType()) {
// trailing-return-type is only required if we're declaring a function,
// and not, for instance, a pointer to a function.
if (D.getDeclSpec().hasAutoTypeSpec() &&
!FTI.hasTrailingReturnType() && chunkIndex == 0) {
if (!S.getLangOpts().CPlusPlus14) {
S.Diag(D.getDeclSpec().getTypeSpecTypeLoc(),
D.getDeclSpec().getTypeSpecType() == DeclSpec::TST_auto
? diag::err_auto_missing_trailing_return
: diag::err_deduced_return_type);
T = Context.IntTy;
D.setInvalidType(true);
} else {
S.Diag(D.getDeclSpec().getTypeSpecTypeLoc(),
diag::warn_cxx11_compat_deduced_return_type);
}
} else if (FTI.hasTrailingReturnType()) {
// T must be exactly 'auto' at this point. See CWG issue 681.
if (isa<ParenType>(T)) {
S.Diag(D.getBeginLoc(), diag::err_trailing_return_in_parens)
<< T << D.getSourceRange();
D.setInvalidType(true);
} else if (D.getName().getKind() ==
UnqualifiedIdKind::IK_DeductionGuideName) {
if (T != Context.DependentTy) {
S.Diag(D.getDeclSpec().getBeginLoc(),
diag::err_deduction_guide_with_complex_decl)
<< D.getSourceRange();
D.setInvalidType(true);
}
} else if (D.getContext() != DeclaratorContext::LambdaExprContext &&
(T.hasQualifiers() || !isa<AutoType>(T) ||
cast<AutoType>(T)->getKeyword() !=
AutoTypeKeyword::Auto)) {
S.Diag(D.getDeclSpec().getTypeSpecTypeLoc(),
diag::err_trailing_return_without_auto)
<< T << D.getDeclSpec().getSourceRange();
D.setInvalidType(true);
}
T = S.GetTypeFromParser(FTI.getTrailingReturnType(), &TInfo);
if (T.isNull()) {
// An error occurred parsing the trailing return type.
T = Context.IntTy;
D.setInvalidType(true);
}
} else {
// This function type is not the type of the entity being declared,
// so checking the 'auto' is not the responsibility of this chunk.
}
}
// C99 6.7.5.3p1: The return type may not be a function or array type.
// For conversion functions, we'll diagnose this particular error later.
if (!D.isInvalidType() && (T->isArrayType() || T->isFunctionType()) &&
(D.getName().getKind() !=
UnqualifiedIdKind::IK_ConversionFunctionId)) {
unsigned diagID = diag::err_func_returning_array_function;
// If this is the last chunk to be processed in a block-literal context,
// this function chunk represents the block itself.
if (chunkIndex == 0 &&
D.getContext() == DeclaratorContext::BlockLiteralContext)
diagID = diag::err_block_returning_array_function;
S.Diag(DeclType.Loc, diagID) << T->isFunctionType() << T;
T = Context.IntTy;
D.setInvalidType(true);
}
// Do not allow returning half FP value.
// FIXME: This really should be in BuildFunctionType.
if (T->isHalfType()) {
if (S.getLangOpts().OpenCL) {
if (!S.getOpenCLOptions().isEnabled("cl_khr_fp16")) {
S.Diag(D.getIdentifierLoc(), diag::err_opencl_invalid_return)
<< T << 0 /*pointer hint*/;
D.setInvalidType(true);
}
} else if (!S.getLangOpts().HalfArgsAndReturns) {
S.Diag(D.getIdentifierLoc(),
diag::err_parameters_retval_cannot_have_fp16_type) << 1;
D.setInvalidType(true);
}
}
if (LangOpts.OpenCL) {
// OpenCL v2.0 s6.12.5 - A block cannot be the return value of a
// function.
if (T->isBlockPointerType() || T->isImageType() || T->isSamplerT() ||
T->isPipeType()) {
S.Diag(D.getIdentifierLoc(), diag::err_opencl_invalid_return)
<< T << 1 /*hint off*/;
D.setInvalidType(true);
}
// OpenCL doesn't support variadic functions and blocks
// (s6.9.e and s6.12.5 OpenCL v2.0) except for printf.
// We also allow any toolchain-reserved identifiers here.
if (FTI.isVariadic &&
!(D.getIdentifier() &&
((D.getIdentifier()->getName() == "printf" &&
(LangOpts.OpenCLCPlusPlus || LangOpts.OpenCLVersion >= 120)) ||
D.getIdentifier()->getName().startswith("__")))) {
S.Diag(D.getIdentifierLoc(), diag::err_opencl_variadic_function);
D.setInvalidType(true);
}
}
// Methods cannot return interface types. All ObjC objects are
// passed by reference.
if (T->isObjCObjectType()) {
SourceLocation DiagLoc, FixitLoc;
if (TInfo) {
DiagLoc = TInfo->getTypeLoc().getBeginLoc();
FixitLoc = S.getLocForEndOfToken(TInfo->getTypeLoc().getEndLoc());
} else {
DiagLoc = D.getDeclSpec().getTypeSpecTypeLoc();
FixitLoc = S.getLocForEndOfToken(D.getDeclSpec().getEndLoc());
}
S.Diag(DiagLoc, diag::err_object_cannot_be_passed_returned_by_value)
<< 0 << T
<< FixItHint::CreateInsertion(FixitLoc, "*");
T = Context.getObjCObjectPointerType(T);
if (TInfo) {
TypeLocBuilder TLB;
TLB.pushFullCopy(TInfo->getTypeLoc());
ObjCObjectPointerTypeLoc TLoc = TLB.push<ObjCObjectPointerTypeLoc>(T);
TLoc.setStarLoc(FixitLoc);
TInfo = TLB.getTypeSourceInfo(Context, T);
}
D.setInvalidType(true);
}
// cv-qualifiers on return types are pointless except when the type is a
// class type in C++.
if ((T.getCVRQualifiers() || T->isAtomicType()) &&
!(S.getLangOpts().CPlusPlus &&
(T->isDependentType() || T->isRecordType()))) {
if (T->isVoidType() && !S.getLangOpts().CPlusPlus &&
D.getFunctionDefinitionKind() == FDK_Definition) {
// [6.9.1/3] qualified void return is invalid on a C
// function definition. Apparently ok on declarations and
// in C++ though (!)
S.Diag(DeclType.Loc, diag::err_func_returning_qualified_void) << T;
} else
diagnoseRedundantReturnTypeQualifiers(S, T, D, chunkIndex);
}
// Objective-C ARC ownership qualifiers are ignored on the function
// return type (by type canonicalization). Complain if this attribute
// was written here.
if (T.getQualifiers().hasObjCLifetime()) {
SourceLocation AttrLoc;
if (chunkIndex + 1 < D.getNumTypeObjects()) {
DeclaratorChunk ReturnTypeChunk = D.getTypeObject(chunkIndex + 1);
for (const ParsedAttr &AL : ReturnTypeChunk.getAttrs()) {
if (AL.getKind() == ParsedAttr::AT_ObjCOwnership) {
AttrLoc = AL.getLoc();
break;
}
}
}
if (AttrLoc.isInvalid()) {
for (const ParsedAttr &AL : D.getDeclSpec().getAttributes()) {
if (AL.getKind() == ParsedAttr::AT_ObjCOwnership) {
AttrLoc = AL.getLoc();
break;
}
}
}
if (AttrLoc.isValid()) {
// The ownership attributes are almost always written via the predefined
// __strong/__weak/__autoreleasing/__unsafe_unretained.
if (AttrLoc.isMacroID())
AttrLoc =
S.SourceMgr.getImmediateExpansionRange(AttrLoc).getBegin();
S.Diag(AttrLoc, diag::warn_arc_lifetime_result_type)
<< T.getQualifiers().getObjCLifetime();
}
}
if (LangOpts.CPlusPlus && D.getDeclSpec().hasTagDefinition()) {
// C++ [dcl.fct]p6:
// Types shall not be defined in return or parameter types.
TagDecl *Tag = cast<TagDecl>(D.getDeclSpec().getRepAsDecl());
S.Diag(Tag->getLocation(), diag::err_type_defined_in_result_type)
<< Context.getTypeDeclType(Tag);
}
// Exception specs are not allowed in typedefs. Complain, but add it
// anyway.
if (IsTypedefName && FTI.getExceptionSpecType() && !LangOpts.CPlusPlus17)
S.Diag(FTI.getExceptionSpecLocBeg(),
diag::err_exception_spec_in_typedef)
<< (D.getContext() == DeclaratorContext::AliasDeclContext ||
D.getContext() == DeclaratorContext::AliasTemplateContext);
// If we see "T var();" or "T var(T());" at block scope, it is probably
// an attempt to initialize a variable, not a function declaration.
if (FTI.isAmbiguous)
warnAboutAmbiguousFunction(S, D, DeclType, T);
FunctionType::ExtInfo EI(
getCCForDeclaratorChunk(S, D, DeclType.getAttrs(), FTI, chunkIndex));
if (!FTI.NumParams && !FTI.isVariadic && !LangOpts.CPlusPlus
&& !LangOpts.OpenCL) {
// Simple void foo(), where the incoming T is the result type.
T = Context.getFunctionNoProtoType(T, EI);
} else {
// We allow a zero-parameter variadic function in C if the
// function is marked with the "overloadable" attribute. Scan
// for this attribute now.
if (!FTI.NumParams && FTI.isVariadic && !LangOpts.CPlusPlus)
if (!D.getAttributes().hasAttribute(ParsedAttr::AT_Overloadable))
S.Diag(FTI.getEllipsisLoc(), diag::err_ellipsis_first_param);
if (FTI.NumParams && FTI.Params[0].Param == nullptr) {
// C99 6.7.5.3p3: Reject int(x,y,z) when it's not a function
// definition.
S.Diag(FTI.Params[0].IdentLoc,
diag::err_ident_list_in_fn_declaration);
D.setInvalidType(true);
// Recover by creating a K&R-style function type.
T = Context.getFunctionNoProtoType(T, EI);
break;
}
FunctionProtoType::ExtProtoInfo EPI;
EPI.ExtInfo = EI;
EPI.Variadic = FTI.isVariadic;
EPI.HasTrailingReturn = FTI.hasTrailingReturnType();
EPI.TypeQuals.addCVRUQualifiers(
FTI.MethodQualifiers ? FTI.MethodQualifiers->getTypeQualifiers()
: 0);
EPI.RefQualifier = !FTI.hasRefQualifier()? RQ_None
: FTI.RefQualifierIsLValueRef? RQ_LValue
: RQ_RValue;
// Otherwise, we have a function with a parameter list that is
// potentially variadic.
SmallVector<QualType, 16> ParamTys;
ParamTys.reserve(FTI.NumParams);
SmallVector<FunctionProtoType::ExtParameterInfo, 16>
ExtParameterInfos(FTI.NumParams);
bool HasAnyInterestingExtParameterInfos = false;
for (unsigned i = 0, e = FTI.NumParams; i != e; ++i) {
ParmVarDecl *Param = cast<ParmVarDecl>(FTI.Params[i].Param);
QualType ParamTy = Param->getType();
assert(!ParamTy.isNull() && "Couldn't parse type?");
// Look for 'void'. void is allowed only as a single parameter to a
// function with no other parameters (C99 6.7.5.3p10). We record
// int(void) as a FunctionProtoType with an empty parameter list.
if (ParamTy->isVoidType()) {
// If this is something like 'float(int, void)', reject it. 'void'
// is an incomplete type (C99 6.2.5p19) and function decls cannot
// have parameters of incomplete type.
if (FTI.NumParams != 1 || FTI.isVariadic) {
S.Diag(DeclType.Loc, diag::err_void_only_param);
ParamTy = Context.IntTy;
Param->setType(ParamTy);
} else if (FTI.Params[i].Ident) {
// Reject, but continue to parse 'int(void abc)'.
S.Diag(FTI.Params[i].IdentLoc, diag::err_param_with_void_type);
ParamTy = Context.IntTy;
Param->setType(ParamTy);
} else {
// Reject, but continue to parse 'float(const void)'.
if (ParamTy.hasQualifiers())
S.Diag(DeclType.Loc, diag::err_void_param_qualified);
// Do not add 'void' to the list.
break;
}
} else if (ParamTy->isHalfType()) {
// Disallow half FP parameters.
// FIXME: This really should be in BuildFunctionType.
if (S.getLangOpts().OpenCL) {
if (!S.getOpenCLOptions().isEnabled("cl_khr_fp16")) {
S.Diag(Param->getLocation(),
diag::err_opencl_half_param) << ParamTy;
D.setInvalidType();
Param->setInvalidDecl();
}
} else if (!S.getLangOpts().HalfArgsAndReturns) {
S.Diag(Param->getLocation(),
diag::err_parameters_retval_cannot_have_fp16_type) << 0;
D.setInvalidType();
}
} else if (!FTI.hasPrototype) {
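// K&R-style (unprototyped) parameters undergo the default argument
// promotions (C99 6.5.2.2p6): e.g. 'char' and 'short' promote to 'int',
// and 'float' promotes to 'double'.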
if (ParamTy->isPromotableIntegerType()) {
ParamTy = Context.getPromotedIntegerType(ParamTy);
Param->setKNRPromoted(true);
} else if (const BuiltinType* BTy = ParamTy->getAs<BuiltinType>()) {
if (BTy->getKind() == BuiltinType::Float) {
ParamTy = Context.DoubleTy;
Param->setKNRPromoted(true);
}
}
}
if (LangOpts.ObjCAutoRefCount && Param->hasAttr<NSConsumedAttr>()) {
ExtParameterInfos[i] = ExtParameterInfos[i].withIsConsumed(true);
HasAnyInterestingExtParameterInfos = true;
}
if (auto attr = Param->getAttr<ParameterABIAttr>()) {
ExtParameterInfos[i] =
ExtParameterInfos[i].withABI(attr->getABI());
HasAnyInterestingExtParameterInfos = true;
}
if (Param->hasAttr<PassObjectSizeAttr>()) {
ExtParameterInfos[i] = ExtParameterInfos[i].withHasPassObjectSize();
HasAnyInterestingExtParameterInfos = true;
}
if (Param->hasAttr<NoEscapeAttr>()) {
ExtParameterInfos[i] = ExtParameterInfos[i].withIsNoEscape(true);
HasAnyInterestingExtParameterInfos = true;
}
ParamTys.push_back(ParamTy);
}
if (HasAnyInterestingExtParameterInfos) {
EPI.ExtParameterInfos = ExtParameterInfos.data();
checkExtParameterInfos(S, ParamTys, EPI,
[&](unsigned i) { return FTI.Params[i].Param->getLocation(); });
}
SmallVector<QualType, 4> Exceptions;
SmallVector<ParsedType, 2> DynamicExceptions;
SmallVector<SourceRange, 2> DynamicExceptionRanges;
Expr *NoexceptExpr = nullptr;
if (FTI.getExceptionSpecType() == EST_Dynamic) {
// FIXME: It's rather inefficient to have to split into two vectors
// here.
unsigned N = FTI.getNumExceptions();
DynamicExceptions.reserve(N);
DynamicExceptionRanges.reserve(N);
for (unsigned I = 0; I != N; ++I) {
DynamicExceptions.push_back(FTI.Exceptions[I].Ty);
DynamicExceptionRanges.push_back(FTI.Exceptions[I].Range);
}
} else if (isComputedNoexcept(FTI.getExceptionSpecType())) {
NoexceptExpr = FTI.NoexceptExpr;
}
S.checkExceptionSpecification(D.isFunctionDeclarationContext(),
FTI.getExceptionSpecType(),
DynamicExceptions,
DynamicExceptionRanges,
NoexceptExpr,
Exceptions,
EPI.ExceptionSpec);
// FIXME: Set address space from attrs for C++ mode here.
// OpenCLCPlusPlus: A class member function has an address space.
auto IsClassMember = [&]() {
return (!state.getDeclarator().getCXXScopeSpec().isEmpty() &&
state.getDeclarator()
.getCXXScopeSpec()
.getScopeRep()
->getKind() == NestedNameSpecifier::TypeSpec) ||
state.getDeclarator().getContext() ==
DeclaratorContext::MemberContext;
};
if (state.getSema().getLangOpts().OpenCLCPlusPlus && IsClassMember()) {
LangAS ASIdx = LangAS::Default;
// Take address space attr if any and mark as invalid to avoid adding
// them later while creating QualType.
if (FTI.MethodQualifiers)
for (ParsedAttr &attr : FTI.MethodQualifiers->getAttributes()) {
LangAS ASIdxNew = attr.asOpenCLLangAS();
if (DiagnoseMultipleAddrSpaceAttributes(S, ASIdx, ASIdxNew,
attr.getLoc()))
D.setInvalidType(true);
else
ASIdx = ASIdxNew;
}
// If a class member function's address space is not set, set it to
// __generic.
LangAS AS =
(ASIdx == LangAS::Default ? LangAS::opencl_generic : ASIdx);
EPI.TypeQuals.addAddressSpace(AS);
}
T = Context.getFunctionType(T, ParamTys, EPI);
}
break;
}
case DeclaratorChunk::MemberPointer: {
// The scope spec must refer to a class, or be dependent.
CXXScopeSpec &SS = DeclType.Mem.Scope();
QualType ClsType;
// Handle pointer nullability.
inferPointerNullability(SimplePointerKind::MemberPointer, DeclType.Loc,
DeclType.EndLoc, DeclType.getAttrs(),
state.getDeclarator().getAttributePool());
if (SS.isInvalid()) {
// Avoid emitting extra errors if we already errored on the scope.
D.setInvalidType(true);
} else if (S.isDependentScopeSpecifier(SS) ||
dyn_cast_or_null<CXXRecordDecl>(S.computeDeclContext(SS))) {
NestedNameSpecifier *NNS = SS.getScopeRep();
NestedNameSpecifier *NNSPrefix = NNS->getPrefix();
switch (NNS->getKind()) {
case NestedNameSpecifier::Identifier:
ClsType = Context.getDependentNameType(ETK_None, NNSPrefix,
NNS->getAsIdentifier());
break;
case NestedNameSpecifier::Namespace:
case NestedNameSpecifier::NamespaceAlias:
case NestedNameSpecifier::Global:
case NestedNameSpecifier::Super:
llvm_unreachable("Nested-name-specifier must name a type");
case NestedNameSpecifier::TypeSpec:
case NestedNameSpecifier::TypeSpecWithTemplate:
ClsType = QualType(NNS->getAsType(), 0);
// Note: if the NNS has a prefix and ClsType is a nondependent
// TemplateSpecializationType, then the NNS prefix is NOT included
// in ClsType; hence we wrap ClsType into an ElaboratedType.
// NOTE: in particular, no wrap occurs if ClsType already is an
// Elaborated, DependentName, or DependentTemplateSpecialization.
if (NNSPrefix && isa<TemplateSpecializationType>(NNS->getAsType()))
ClsType = Context.getElaboratedType(ETK_None, NNSPrefix, ClsType);
break;
}
} else {
S.Diag(DeclType.Mem.Scope().getBeginLoc(),
diag::err_illegal_decl_mempointer_in_nonclass)
<< (D.getIdentifier() ? D.getIdentifier()->getName() : "type name")
<< DeclType.Mem.Scope().getRange();
D.setInvalidType(true);
}
if (!ClsType.isNull())
T = S.BuildMemberPointerType(T, ClsType, DeclType.Loc,
D.getIdentifier());
if (T.isNull()) {
T = Context.IntTy;
D.setInvalidType(true);
} else if (DeclType.Mem.TypeQuals) {
T = S.BuildQualifiedType(T, DeclType.Loc, DeclType.Mem.TypeQuals);
}
break;
}
case DeclaratorChunk::Pipe: {
T = S.BuildReadPipeType(T, DeclType.Loc);
processTypeAttrs(state, T, TAL_DeclSpec,
D.getMutableDeclSpec().getAttributes());
break;
}
}
if (T.isNull()) {
D.setInvalidType(true);
T = Context.IntTy;
}
// See if there are any attributes on this declarator chunk.
processTypeAttrs(state, T, TAL_DeclChunk, DeclType.getAttrs());
if (DeclType.Kind != DeclaratorChunk::Paren) {
if (ExpectNoDerefChunk && !IsNoDerefableChunk(DeclType))
S.Diag(DeclType.Loc, diag::warn_noderef_on_non_pointer_or_array);
ExpectNoDerefChunk = state.didParseNoDeref();
}
}
if (ExpectNoDerefChunk)
S.Diag(state.getDeclarator().getBeginLoc(),
diag::warn_noderef_on_non_pointer_or_array);
// GNU warning -Wstrict-prototypes
// Warn if a function declaration is without a prototype.
// This warning is issued for all kinds of unprototyped function
// declarations (i.e. function type typedef, function pointer etc.)
// C99 6.7.5.3p14:
// The empty list in a function declarator that is not part of a definition
// of that function specifies that no information about the number or types
// of the parameters is supplied.
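// For example (illustrative): in C, 'int f();' declares f without a
// prototype and is diagnosed here, with a fix-it that inserts 'void' before
// the right parenthesis; 'int f(void);' is not warned about.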
if (!LangOpts.CPlusPlus && D.getFunctionDefinitionKind() == FDK_Declaration) {
bool IsBlock = false;
for (const DeclaratorChunk &DeclType : D.type_objects()) {
switch (DeclType.Kind) {
case DeclaratorChunk::BlockPointer:
IsBlock = true;
break;
case DeclaratorChunk::Function: {
const DeclaratorChunk::FunctionTypeInfo &FTI = DeclType.Fun;
// We suppress the warning when there's no LParen location, as this
// indicates the declaration was an implicit declaration, which gets
// warned about separately via -Wimplicit-function-declaration.
if (FTI.NumParams == 0 && !FTI.isVariadic && FTI.getLParenLoc().isValid())
S.Diag(DeclType.Loc, diag::warn_strict_prototypes)
<< IsBlock
<< FixItHint::CreateInsertion(FTI.getRParenLoc(), "void");
IsBlock = false;
break;
}
default:
break;
}
}
}
assert(!T.isNull() && "T must not be null after this point");
if (LangOpts.CPlusPlus && T->isFunctionType()) {
const FunctionProtoType *FnTy = T->getAs<FunctionProtoType>();
assert(FnTy && "Why oh why is there not a FunctionProtoType here?");
// C++ 8.3.5p4:
// A cv-qualifier-seq shall only be part of the function type
// for a nonstatic member function, the function type to which a pointer
// to member refers, or the top-level function type of a function typedef
// declaration.
//
// Core issue 547 also allows cv-qualifiers on function types that are
// top-level template type arguments.
enum { NonMember, Member, DeductionGuide } Kind = NonMember;
if (D.getName().getKind() == UnqualifiedIdKind::IK_DeductionGuideName)
Kind = DeductionGuide;
else if (!D.getCXXScopeSpec().isSet()) {
if ((D.getContext() == DeclaratorContext::MemberContext ||
D.getContext() == DeclaratorContext::LambdaExprContext) &&
!D.getDeclSpec().isFriendSpecified())
Kind = Member;
} else {
DeclContext *DC = S.computeDeclContext(D.getCXXScopeSpec());
if (!DC || DC->isRecord())
Kind = Member;
}
// C++11 [dcl.fct]p6 (w/DR1417):
// An attempt to specify a function type with a cv-qualifier-seq or a
// ref-qualifier (including by typedef-name) is ill-formed unless it is:
// - the function type for a non-static member function,
// - the function type to which a pointer to member refers,
// - the top-level function type of a function typedef declaration or
// alias-declaration,
// - the type-id in the default argument of a type-parameter, or
// - the type-id of a template-argument for a type-parameter
//
// FIXME: Checking this here is insufficient. We accept-invalid on:
//
// template<typename T> struct S { void f(T); };
// S<int() const> s;
//
// ... for instance.
if (IsQualifiedFunction &&
!(Kind == Member &&
D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_static) &&
!IsTypedefName &&
D.getContext() != DeclaratorContext::TemplateArgContext &&
D.getContext() != DeclaratorContext::TemplateTypeArgContext) {
SourceLocation Loc = D.getBeginLoc();
SourceRange RemovalRange;
unsigned I;
if (D.isFunctionDeclarator(I)) {
SmallVector<SourceLocation, 4> RemovalLocs;
const DeclaratorChunk &Chunk = D.getTypeObject(I);
assert(Chunk.Kind == DeclaratorChunk::Function);
if (Chunk.Fun.hasRefQualifier())
RemovalLocs.push_back(Chunk.Fun.getRefQualifierLoc());
if (Chunk.Fun.hasMethodTypeQualifiers())
Chunk.Fun.MethodQualifiers->forEachQualifier(
[&](DeclSpec::TQ TypeQual, StringRef QualName,
SourceLocation SL) { RemovalLocs.push_back(SL); });
if (!RemovalLocs.empty()) {
llvm::sort(RemovalLocs,
BeforeThanCompare<SourceLocation>(S.getSourceManager()));
RemovalRange = SourceRange(RemovalLocs.front(), RemovalLocs.back());
Loc = RemovalLocs.front();
}
}
S.Diag(Loc, diag::err_invalid_qualified_function_type)
<< Kind << D.isFunctionDeclarator() << T
<< getFunctionQualifiersAsString(FnTy)
<< FixItHint::CreateRemoval(RemovalRange);
// Strip the cv-qualifiers and ref-qualifiers from the type.
FunctionProtoType::ExtProtoInfo EPI = FnTy->getExtProtoInfo();
EPI.TypeQuals.removeCVRQualifiers();
EPI.RefQualifier = RQ_None;
T = Context.getFunctionType(FnTy->getReturnType(), FnTy->getParamTypes(),
EPI);
// Rebuild any parens around the identifier in the function type.
for (unsigned i = 0, e = D.getNumTypeObjects(); i != e; ++i) {
if (D.getTypeObject(i).Kind != DeclaratorChunk::Paren)
break;
T = S.BuildParenType(T);
}
}
}
// Apply any undistributed attributes from the declarator.
processTypeAttrs(state, T, TAL_DeclName, D.getAttributes());
// Diagnose any ignored type attributes.
state.diagnoseIgnoredTypeAttrs(T);
// C++0x [dcl.constexpr]p9:
// A constexpr specifier used in an object declaration declares the object
// as const.
if (D.getDeclSpec().hasConstexprSpecifier() && T->isObjectType()) {
T.addConst();
}
// If there was an ellipsis in the declarator, the declaration declares a
// parameter pack whose type may be a pack expansion type.
if (D.hasEllipsis()) {
// C++0x [dcl.fct]p13:
// A declarator-id or abstract-declarator containing an ellipsis shall
// only be used in a parameter-declaration. Such a parameter-declaration
// is a parameter pack (14.5.3). [...]
switch (D.getContext()) {
case DeclaratorContext::PrototypeContext:
case DeclaratorContext::LambdaExprParameterContext:
// C++0x [dcl.fct]p13:
// [...] When it is part of a parameter-declaration-clause, the
// parameter pack is a function parameter pack (14.5.3). The type T
// of the declarator-id of the function parameter pack shall contain
// a template parameter pack; each template parameter pack in T is
// expanded by the function parameter pack.
//
// We represent function parameter packs as function parameters whose
// type is a pack expansion.
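// For example (illustrative): in 'template<typename ...Ts> void f(Ts ...xs);'
// the parameter xs is a function parameter pack and its type becomes a pack
// expansion of Ts; a declarator-id ellipsis whose type contains no unexpanded
// parameter pack is diagnosed just below.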
if (!T->containsUnexpandedParameterPack()) {
S.Diag(D.getEllipsisLoc(),
diag::err_function_parameter_pack_without_parameter_packs)
<< T << D.getSourceRange();
D.setEllipsisLoc(SourceLocation());
} else {
T = Context.getPackExpansionType(T, None);
}
break;
case DeclaratorContext::TemplateParamContext:
// C++0x [temp.param]p15:
// If a template-parameter is a [...] is a parameter-declaration that
// declares a parameter pack (8.3.5), then the template-parameter is a
// template parameter pack (14.5.3).
//
// Note: core issue 778 clarifies that, if there are any unexpanded
// parameter packs in the type of the non-type template parameter, then
// it expands those parameter packs.
if (T->containsUnexpandedParameterPack())
T = Context.getPackExpansionType(T, None);
else
S.Diag(D.getEllipsisLoc(),
LangOpts.CPlusPlus11
? diag::warn_cxx98_compat_variadic_templates
: diag::ext_variadic_templates);
break;
case DeclaratorContext::FileContext:
case DeclaratorContext::KNRTypeListContext:
case DeclaratorContext::ObjCParameterContext: // FIXME: special diagnostic
// here?
case DeclaratorContext::ObjCResultContext: // FIXME: special diagnostic
// here?
case DeclaratorContext::TypeNameContext:
case DeclaratorContext::FunctionalCastContext:
case DeclaratorContext::CXXNewContext:
case DeclaratorContext::AliasDeclContext:
case DeclaratorContext::AliasTemplateContext:
case DeclaratorContext::MemberContext:
case DeclaratorContext::BlockContext:
case DeclaratorContext::ForContext:
case DeclaratorContext::InitStmtContext:
case DeclaratorContext::ConditionContext:
case DeclaratorContext::CXXCatchContext:
case DeclaratorContext::ObjCCatchContext:
case DeclaratorContext::BlockLiteralContext:
case DeclaratorContext::LambdaExprContext:
case DeclaratorContext::ConversionIdContext:
case DeclaratorContext::TrailingReturnContext:
case DeclaratorContext::TrailingReturnVarContext:
case DeclaratorContext::TemplateArgContext:
case DeclaratorContext::TemplateTypeArgContext:
// FIXME: We may want to allow parameter packs in block-literal contexts
// in the future.
S.Diag(D.getEllipsisLoc(),
diag::err_ellipsis_in_declarator_not_parameter);
D.setEllipsisLoc(SourceLocation());
break;
}
}
assert(!T.isNull() && "T must not be null at the end of this function");
if (D.isInvalidType())
return Context.getTrivialTypeSourceInfo(T);
return GetTypeSourceInfoForDeclarator(state, T, TInfo);
}
/// GetTypeForDeclarator - Convert the type for the specified
/// declarator to Type instances.
///
/// The result of this call will never be null, but the associated
/// type may be a null type if there's an unrecoverable error.
TypeSourceInfo *Sema::GetTypeForDeclarator(Declarator &D, Scope *S) {
// Determine the type of the declarator. Not all forms of declarator
// have a type.
TypeProcessingState state(*this, D);
TypeSourceInfo *ReturnTypeInfo = nullptr;
QualType T = GetDeclSpecTypeForDeclarator(state, ReturnTypeInfo);
if (D.isPrototypeContext() && getLangOpts().ObjCAutoRefCount)
inferARCWriteback(state, T);
return GetFullTypeForDeclarator(state, T, ReturnTypeInfo);
}
static void transferARCOwnershipToDeclSpec(Sema &S,
QualType &declSpecTy,
Qualifiers::ObjCLifetime ownership) {
if (declSpecTy->isObjCRetainableType() &&
declSpecTy.getObjCLifetime() == Qualifiers::OCL_None) {
Qualifiers qs;
qs.addObjCLifetime(ownership);
declSpecTy = S.Context.getQualifiedType(declSpecTy, qs);
}
}
static void transferARCOwnershipToDeclaratorChunk(TypeProcessingState &state,
Qualifiers::ObjCLifetime ownership,
unsigned chunkIndex) {
Sema &S = state.getSema();
Declarator &D = state.getDeclarator();
// Look for an explicit lifetime attribute.
DeclaratorChunk &chunk = D.getTypeObject(chunkIndex);
if (chunk.getAttrs().hasAttribute(ParsedAttr::AT_ObjCOwnership))
return;
const char *attrStr = nullptr;
switch (ownership) {
case Qualifiers::OCL_None: llvm_unreachable("no ownership!");
case Qualifiers::OCL_ExplicitNone: attrStr = "none"; break;
case Qualifiers::OCL_Strong: attrStr = "strong"; break;
case Qualifiers::OCL_Weak: attrStr = "weak"; break;
case Qualifiers::OCL_Autoreleasing: attrStr = "autoreleasing"; break;
}
IdentifierLoc *Arg = new (S.Context) IdentifierLoc;
Arg->Ident = &S.Context.Idents.get(attrStr);
Arg->Loc = SourceLocation();
ArgsUnion Args(Arg);
// If there wasn't one, add one (with an invalid source location
// so that we don't make an AttributedType for it).
ParsedAttr *attr = D.getAttributePool().create(
&S.Context.Idents.get("objc_ownership"), SourceLocation(),
/*scope*/ nullptr, SourceLocation(),
/*args*/ &Args, 1, ParsedAttr::AS_GNU);
chunk.getAttrs().addAtEnd(attr);
// TODO: mark whether we did this inference?
}
/// Used for transferring ownership in casts resulting in l-values.
static void transferARCOwnership(TypeProcessingState &state,
QualType &declSpecTy,
Qualifiers::ObjCLifetime ownership) {
Sema &S = state.getSema();
Declarator &D = state.getDeclarator();
int inner = -1;
bool hasIndirection = false;
for (unsigned i = 0, e = D.getNumTypeObjects(); i != e; ++i) {
DeclaratorChunk &chunk = D.getTypeObject(i);
switch (chunk.Kind) {
case DeclaratorChunk::Paren:
// Ignore parens.
break;
case DeclaratorChunk::Array:
case DeclaratorChunk::Reference:
case DeclaratorChunk::Pointer:
if (inner != -1)
hasIndirection = true;
inner = i;
break;
case DeclaratorChunk::BlockPointer:
if (inner != -1)
transferARCOwnershipToDeclaratorChunk(state, ownership, i);
return;
case DeclaratorChunk::Function:
case DeclaratorChunk::MemberPointer:
case DeclaratorChunk::Pipe:
return;
}
}
if (inner == -1)
return;
DeclaratorChunk &chunk = D.getTypeObject(inner);
if (chunk.Kind == DeclaratorChunk::Pointer) {
if (declSpecTy->isObjCRetainableType())
return transferARCOwnershipToDeclSpec(S, declSpecTy, ownership);
if (declSpecTy->isObjCObjectType() && hasIndirection)
return transferARCOwnershipToDeclaratorChunk(state, ownership, inner);
} else {
assert(chunk.Kind == DeclaratorChunk::Array ||
chunk.Kind == DeclaratorChunk::Reference);
return transferARCOwnershipToDeclSpec(S, declSpecTy, ownership);
}
}
TypeSourceInfo *Sema::GetTypeForDeclaratorCast(Declarator &D, QualType FromTy) {
TypeProcessingState state(*this, D);
TypeSourceInfo *ReturnTypeInfo = nullptr;
QualType declSpecTy = GetDeclSpecTypeForDeclarator(state, ReturnTypeInfo);
if (getLangOpts().ObjC) {
Qualifiers::ObjCLifetime ownership = Context.getInnerObjCOwnership(FromTy);
if (ownership != Qualifiers::OCL_None)
transferARCOwnership(state, declSpecTy, ownership);
}
return GetFullTypeForDeclarator(state, declSpecTy, ReturnTypeInfo);
}
static void fillAttributedTypeLoc(AttributedTypeLoc TL,
TypeProcessingState &State) {
TL.setAttr(State.takeAttrForAttributedType(TL.getTypePtr()));
}
namespace {
class TypeSpecLocFiller : public TypeLocVisitor<TypeSpecLocFiller> {
ASTContext &Context;
TypeProcessingState &State;
const DeclSpec &DS;
public:
TypeSpecLocFiller(ASTContext &Context, TypeProcessingState &State,
const DeclSpec &DS)
: Context(Context), State(State), DS(DS) {}
void VisitAttributedTypeLoc(AttributedTypeLoc TL) {
Visit(TL.getModifiedLoc());
fillAttributedTypeLoc(TL, State);
}
void VisitMacroQualifiedTypeLoc(MacroQualifiedTypeLoc TL) {
Visit(TL.getInnerLoc());
TL.setExpansionLoc(
State.getExpansionLocForMacroQualifiedType(TL.getTypePtr()));
}
void VisitQualifiedTypeLoc(QualifiedTypeLoc TL) {
Visit(TL.getUnqualifiedLoc());
}
void VisitTypedefTypeLoc(TypedefTypeLoc TL) {
TL.setNameLoc(DS.getTypeSpecTypeLoc());
}
void VisitObjCInterfaceTypeLoc(ObjCInterfaceTypeLoc TL) {
TL.setNameLoc(DS.getTypeSpecTypeLoc());
// FIXME. We should have DS.getTypeSpecTypeEndLoc(). But it requires an
// additional field. What we have is good enough for displaying the location
// of the 'fixit' on the interface name.
TL.setNameEndLoc(DS.getEndLoc());
}
void VisitObjCObjectTypeLoc(ObjCObjectTypeLoc TL) {
TypeSourceInfo *RepTInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &RepTInfo);
TL.copy(RepTInfo->getTypeLoc());
}
void VisitObjCObjectPointerTypeLoc(ObjCObjectPointerTypeLoc TL) {
TypeSourceInfo *RepTInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &RepTInfo);
TL.copy(RepTInfo->getTypeLoc());
}
void VisitTemplateSpecializationTypeLoc(TemplateSpecializationTypeLoc TL) {
TypeSourceInfo *TInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
// If we got no declarator info from previous Sema routines,
// just fill with the typespec loc.
if (!TInfo) {
TL.initialize(Context, DS.getTypeSpecTypeNameLoc());
return;
}
TypeLoc OldTL = TInfo->getTypeLoc();
if (TInfo->getType()->getAs<ElaboratedType>()) {
ElaboratedTypeLoc ElabTL = OldTL.castAs<ElaboratedTypeLoc>();
TemplateSpecializationTypeLoc NamedTL = ElabTL.getNamedTypeLoc()
.castAs<TemplateSpecializationTypeLoc>();
TL.copy(NamedTL);
} else {
TL.copy(OldTL.castAs<TemplateSpecializationTypeLoc>());
assert(TL.getRAngleLoc() == OldTL.castAs<TemplateSpecializationTypeLoc>().getRAngleLoc());
}
}
void VisitTypeOfExprTypeLoc(TypeOfExprTypeLoc TL) {
assert(DS.getTypeSpecType() == DeclSpec::TST_typeofExpr);
TL.setTypeofLoc(DS.getTypeSpecTypeLoc());
TL.setParensRange(DS.getTypeofParensRange());
}
void VisitTypeOfTypeLoc(TypeOfTypeLoc TL) {
assert(DS.getTypeSpecType() == DeclSpec::TST_typeofType);
TL.setTypeofLoc(DS.getTypeSpecTypeLoc());
TL.setParensRange(DS.getTypeofParensRange());
assert(DS.getRepAsType());
TypeSourceInfo *TInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
TL.setUnderlyingTInfo(TInfo);
}
void VisitUnaryTransformTypeLoc(UnaryTransformTypeLoc TL) {
// FIXME: This holds only because we only have one unary transform.
assert(DS.getTypeSpecType() == DeclSpec::TST_underlyingType);
TL.setKWLoc(DS.getTypeSpecTypeLoc());
TL.setParensRange(DS.getTypeofParensRange());
assert(DS.getRepAsType());
TypeSourceInfo *TInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
TL.setUnderlyingTInfo(TInfo);
}
void VisitBuiltinTypeLoc(BuiltinTypeLoc TL) {
// By default, use the source location of the type specifier.
TL.setBuiltinLoc(DS.getTypeSpecTypeLoc());
if (TL.needsExtraLocalData()) {
// Set info for the written builtin specifiers.
TL.getWrittenBuiltinSpecs() = DS.getWrittenBuiltinSpecs();
// Try to have a meaningful source location.
if (TL.getWrittenSignSpec() != TSS_unspecified)
TL.expandBuiltinRange(DS.getTypeSpecSignLoc());
if (TL.getWrittenWidthSpec() != TSW_unspecified)
TL.expandBuiltinRange(DS.getTypeSpecWidthRange());
}
}
void VisitElaboratedTypeLoc(ElaboratedTypeLoc TL) {
ElaboratedTypeKeyword Keyword
= TypeWithKeyword::getKeywordForTypeSpec(DS.getTypeSpecType());
if (DS.getTypeSpecType() == TST_typename) {
TypeSourceInfo *TInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
if (TInfo) {
TL.copy(TInfo->getTypeLoc().castAs<ElaboratedTypeLoc>());
return;
}
}
TL.setElaboratedKeywordLoc(Keyword != ETK_None
? DS.getTypeSpecTypeLoc()
: SourceLocation());
const CXXScopeSpec& SS = DS.getTypeSpecScope();
TL.setQualifierLoc(SS.getWithLocInContext(Context));
Visit(TL.getNextTypeLoc().getUnqualifiedLoc());
}
void VisitDependentNameTypeLoc(DependentNameTypeLoc TL) {
assert(DS.getTypeSpecType() == TST_typename);
TypeSourceInfo *TInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
assert(TInfo);
TL.copy(TInfo->getTypeLoc().castAs<DependentNameTypeLoc>());
}
void VisitDependentTemplateSpecializationTypeLoc(
DependentTemplateSpecializationTypeLoc TL) {
assert(DS.getTypeSpecType() == TST_typename);
TypeSourceInfo *TInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
assert(TInfo);
TL.copy(
TInfo->getTypeLoc().castAs<DependentTemplateSpecializationTypeLoc>());
}
void VisitTagTypeLoc(TagTypeLoc TL) {
TL.setNameLoc(DS.getTypeSpecTypeNameLoc());
}
void VisitAtomicTypeLoc(AtomicTypeLoc TL) {
// An AtomicTypeLoc can come from either an _Atomic(...) type specifier
// or an _Atomic qualifier.
if (DS.getTypeSpecType() == DeclSpec::TST_atomic) {
TL.setKWLoc(DS.getTypeSpecTypeLoc());
TL.setParensRange(DS.getTypeofParensRange());
TypeSourceInfo *TInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
assert(TInfo);
TL.getValueLoc().initializeFullCopy(TInfo->getTypeLoc());
} else {
TL.setKWLoc(DS.getAtomicSpecLoc());
// No parens, to indicate this was spelled as an _Atomic qualifier.
TL.setParensRange(SourceRange());
Visit(TL.getValueLoc());
}
}
void VisitPipeTypeLoc(PipeTypeLoc TL) {
TL.setKWLoc(DS.getTypeSpecTypeLoc());
TypeSourceInfo *TInfo = nullptr;
Sema::GetTypeFromParser(DS.getRepAsType(), &TInfo);
TL.getValueLoc().initializeFullCopy(TInfo->getTypeLoc());
}
void VisitTypeLoc(TypeLoc TL) {
// FIXME: add other typespec types and change this to an assert.
TL.initialize(Context, DS.getTypeSpecTypeLoc());
}
};
class DeclaratorLocFiller : public TypeLocVisitor<DeclaratorLocFiller> {
ASTContext &Context;
TypeProcessingState &State;
const DeclaratorChunk &Chunk;
public:
DeclaratorLocFiller(ASTContext &Context, TypeProcessingState &State,
const DeclaratorChunk &Chunk)
: Context(Context), State(State), Chunk(Chunk) {}
void VisitQualifiedTypeLoc(QualifiedTypeLoc TL) {
llvm_unreachable("qualified type locs not expected here!");
}
void VisitDecayedTypeLoc(DecayedTypeLoc TL) {
llvm_unreachable("decayed type locs not expected here!");
}
void VisitAttributedTypeLoc(AttributedTypeLoc TL) {
fillAttributedTypeLoc(TL, State);
}
void VisitAdjustedTypeLoc(AdjustedTypeLoc TL) {
// nothing
}
void VisitBlockPointerTypeLoc(BlockPointerTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::BlockPointer);
TL.setCaretLoc(Chunk.Loc);
}
void VisitPointerTypeLoc(PointerTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::Pointer);
TL.setStarLoc(Chunk.Loc);
}
void VisitObjCObjectPointerTypeLoc(ObjCObjectPointerTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::Pointer);
TL.setStarLoc(Chunk.Loc);
}
void VisitMemberPointerTypeLoc(MemberPointerTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::MemberPointer);
const CXXScopeSpec& SS = Chunk.Mem.Scope();
NestedNameSpecifierLoc NNSLoc = SS.getWithLocInContext(Context);
const Type* ClsTy = TL.getClass();
QualType ClsQT = QualType(ClsTy, 0);
TypeSourceInfo *ClsTInfo = Context.CreateTypeSourceInfo(ClsQT, 0);
// Now copy source location info into the type loc component.
TypeLoc ClsTL = ClsTInfo->getTypeLoc();
switch (NNSLoc.getNestedNameSpecifier()->getKind()) {
case NestedNameSpecifier::Identifier:
assert(isa<DependentNameType>(ClsTy) && "Unexpected TypeLoc");
{
DependentNameTypeLoc DNTLoc = ClsTL.castAs<DependentNameTypeLoc>();
DNTLoc.setElaboratedKeywordLoc(SourceLocation());
DNTLoc.setQualifierLoc(NNSLoc.getPrefix());
DNTLoc.setNameLoc(NNSLoc.getLocalBeginLoc());
}
break;
case NestedNameSpecifier::TypeSpec:
case NestedNameSpecifier::TypeSpecWithTemplate:
if (isa<ElaboratedType>(ClsTy)) {
ElaboratedTypeLoc ETLoc = ClsTL.castAs<ElaboratedTypeLoc>();
ETLoc.setElaboratedKeywordLoc(SourceLocation());
ETLoc.setQualifierLoc(NNSLoc.getPrefix());
TypeLoc NamedTL = ETLoc.getNamedTypeLoc();
NamedTL.initializeFullCopy(NNSLoc.getTypeLoc());
} else {
ClsTL.initializeFullCopy(NNSLoc.getTypeLoc());
}
break;
case NestedNameSpecifier::Namespace:
case NestedNameSpecifier::NamespaceAlias:
case NestedNameSpecifier::Global:
case NestedNameSpecifier::Super:
llvm_unreachable("Nested-name-specifier must name a type");
}
// Finally fill in MemberPointerLocInfo fields.
TL.setStarLoc(Chunk.Loc);
TL.setClassTInfo(ClsTInfo);
}
void VisitLValueReferenceTypeLoc(LValueReferenceTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::Reference);
// 'Amp' is misleading: this might have been originally
// spelled with AmpAmp.
TL.setAmpLoc(Chunk.Loc);
}
void VisitRValueReferenceTypeLoc(RValueReferenceTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::Reference);
assert(!Chunk.Ref.LValueRef);
TL.setAmpAmpLoc(Chunk.Loc);
}
void VisitArrayTypeLoc(ArrayTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::Array);
TL.setLBracketLoc(Chunk.Loc);
TL.setRBracketLoc(Chunk.EndLoc);
TL.setSizeExpr(static_cast<Expr*>(Chunk.Arr.NumElts));
}
void VisitFunctionTypeLoc(FunctionTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::Function);
TL.setLocalRangeBegin(Chunk.Loc);
TL.setLocalRangeEnd(Chunk.EndLoc);
const DeclaratorChunk::FunctionTypeInfo &FTI = Chunk.Fun;
TL.setLParenLoc(FTI.getLParenLoc());
TL.setRParenLoc(FTI.getRParenLoc());
for (unsigned i = 0, e = TL.getNumParams(), tpi = 0; i != e; ++i) {
ParmVarDecl *Param = cast<ParmVarDecl>(FTI.Params[i].Param);
TL.setParam(tpi++, Param);
}
TL.setExceptionSpecRange(FTI.getExceptionSpecRange());
}
void VisitParenTypeLoc(ParenTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::Paren);
TL.setLParenLoc(Chunk.Loc);
TL.setRParenLoc(Chunk.EndLoc);
}
void VisitPipeTypeLoc(PipeTypeLoc TL) {
assert(Chunk.Kind == DeclaratorChunk::Pipe);
TL.setKWLoc(Chunk.Loc);
}
void VisitMacroQualifiedTypeLoc(MacroQualifiedTypeLoc TL) {
TL.setExpansionLoc(Chunk.Loc);
}
void VisitTypeLoc(TypeLoc TL) {
llvm_unreachable("unsupported TypeLoc kind in declarator!");
}
};
} // end anonymous namespace
static void fillAtomicQualLoc(AtomicTypeLoc ATL, const DeclaratorChunk &Chunk) {
SourceLocation Loc;
switch (Chunk.Kind) {
case DeclaratorChunk::Function:
case DeclaratorChunk::Array:
case DeclaratorChunk::Paren:
case DeclaratorChunk::Pipe:
llvm_unreachable("cannot be _Atomic qualified");
case DeclaratorChunk::Pointer:
Loc = SourceLocation::getFromRawEncoding(Chunk.Ptr.AtomicQualLoc);
break;
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::Reference:
case DeclaratorChunk::MemberPointer:
// FIXME: Provide a source location for the _Atomic keyword.
break;
}
ATL.setKWLoc(Loc);
ATL.setParensRange(SourceRange());
}
static void
fillDependentAddressSpaceTypeLoc(DependentAddressSpaceTypeLoc DASTL,
const ParsedAttributesView &Attrs) {
for (const ParsedAttr &AL : Attrs) {
if (AL.getKind() == ParsedAttr::AT_AddressSpace) {
DASTL.setAttrNameLoc(AL.getLoc());
DASTL.setAttrExprOperand(AL.getArgAsExpr(0));
DASTL.setAttrOperandParensRange(SourceRange());
return;
}
}
llvm_unreachable(
"no address_space attribute found at the expected location!");
}
/// Create and instantiate a TypeSourceInfo with type source information.
///
/// \param T QualType referring to the type as written in source code.
///
/// \param ReturnTypeInfo For declarators whose return type does not show
/// up in the normal place in the declaration specifiers (such as a C++
/// conversion function), this pointer will refer to a type source information
/// for that return type.
static TypeSourceInfo *
GetTypeSourceInfoForDeclarator(TypeProcessingState &State,
QualType T, TypeSourceInfo *ReturnTypeInfo) {
Sema &S = State.getSema();
Declarator &D = State.getDeclarator();
TypeSourceInfo *TInfo = S.Context.CreateTypeSourceInfo(T);
UnqualTypeLoc CurrTL = TInfo->getTypeLoc().getUnqualifiedLoc();
// Handle parameter packs whose type is a pack expansion.
if (isa<PackExpansionType>(T)) {
CurrTL.castAs<PackExpansionTypeLoc>().setEllipsisLoc(D.getEllipsisLoc());
CurrTL = CurrTL.getNextTypeLoc().getUnqualifiedLoc();
}
for (unsigned i = 0, e = D.getNumTypeObjects(); i != e; ++i) {
// An AtomicTypeLoc might be produced by an atomic qualifier in this
// declarator chunk.
if (AtomicTypeLoc ATL = CurrTL.getAs<AtomicTypeLoc>()) {
fillAtomicQualLoc(ATL, D.getTypeObject(i));
CurrTL = ATL.getValueLoc().getUnqualifiedLoc();
}
while (MacroQualifiedTypeLoc TL = CurrTL.getAs<MacroQualifiedTypeLoc>()) {
TL.setExpansionLoc(
State.getExpansionLocForMacroQualifiedType(TL.getTypePtr()));
CurrTL = TL.getNextTypeLoc().getUnqualifiedLoc();
}
while (AttributedTypeLoc TL = CurrTL.getAs<AttributedTypeLoc>()) {
fillAttributedTypeLoc(TL, State);
CurrTL = TL.getNextTypeLoc().getUnqualifiedLoc();
}
while (DependentAddressSpaceTypeLoc TL =
CurrTL.getAs<DependentAddressSpaceTypeLoc>()) {
fillDependentAddressSpaceTypeLoc(TL, D.getTypeObject(i).getAttrs());
CurrTL = TL.getPointeeTypeLoc().getUnqualifiedLoc();
}
// FIXME: Ordering here?
while (AdjustedTypeLoc TL = CurrTL.getAs<AdjustedTypeLoc>())
CurrTL = TL.getNextTypeLoc().getUnqualifiedLoc();
DeclaratorLocFiller(S.Context, State, D.getTypeObject(i)).Visit(CurrTL);
CurrTL = CurrTL.getNextTypeLoc().getUnqualifiedLoc();
}
// If we have different source information for the return type, use
// that. This really only applies to C++ conversion functions.
if (ReturnTypeInfo) {
TypeLoc TL = ReturnTypeInfo->getTypeLoc();
assert(TL.getFullDataSize() == CurrTL.getFullDataSize());
memcpy(CurrTL.getOpaqueData(), TL.getOpaqueData(), TL.getFullDataSize());
} else {
TypeSpecLocFiller(S.Context, State, D.getDeclSpec()).Visit(CurrTL);
}
return TInfo;
}
/// Create a LocInfoType to hold the given QualType and TypeSourceInfo.
ParsedType Sema::CreateParsedType(QualType T, TypeSourceInfo *TInfo) {
// FIXME: LocInfoTypes are "transient", only needed for passing to/from Parser
// and Sema during declaration parsing. Try deallocating/caching them when
// it's appropriate, instead of allocating them and keeping them around.
LocInfoType *LocT = (LocInfoType*)BumpAlloc.Allocate(sizeof(LocInfoType),
TypeAlignment);
new (LocT) LocInfoType(T, TInfo);
assert(LocT->getTypeClass() != T->getTypeClass() &&
"LocInfoType's TypeClass conflicts with an existing Type class");
return ParsedType::make(QualType(LocT, 0));
}
void LocInfoType::getAsStringInternal(std::string &Str,
const PrintingPolicy &Policy) const {
llvm_unreachable("LocInfoType leaked into the type system; an opaque TypeTy*"
" was used directly instead of getting the QualType through"
" GetTypeFromParser");
}
TypeResult Sema::ActOnTypeName(Scope *S, Declarator &D) {
// C99 6.7.6: Type names have no identifier. This is already validated by
// the parser.
assert(D.getIdentifier() == nullptr &&
"Type name should have no identifier!");
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
QualType T = TInfo->getType();
if (D.isInvalidType())
return true;
// Make sure there are no unused decl attributes on the declarator.
// We don't want to do this for ObjC parameters because we're going
// to apply them to the actual parameter declaration.
// Likewise, we don't want to do this for alias declarations, because
// we are actually going to build a declaration from this eventually.
if (D.getContext() != DeclaratorContext::ObjCParameterContext &&
D.getContext() != DeclaratorContext::AliasDeclContext &&
D.getContext() != DeclaratorContext::AliasTemplateContext)
checkUnusedDeclAttributes(D);
if (getLangOpts().CPlusPlus) {
// Check that there are no default arguments (C++ only).
CheckExtraCXXDefaultArguments(D);
}
return CreateParsedType(T, TInfo);
}
ParsedType Sema::ActOnObjCInstanceType(SourceLocation Loc) {
QualType T = Context.getObjCInstanceType();
TypeSourceInfo *TInfo = Context.getTrivialTypeSourceInfo(T, Loc);
return CreateParsedType(T, TInfo);
}
//===----------------------------------------------------------------------===//
// Type Attribute Processing
//===----------------------------------------------------------------------===//
/// Build an AddressSpace index from a constant expression and diagnose any
/// errors related to invalid address_spaces. Returns true on successfully
/// building an AddressSpace index.
static bool BuildAddressSpaceIndex(Sema &S, LangAS &ASIdx,
const Expr *AddrSpace,
SourceLocation AttrLoc) {
if (!AddrSpace->isValueDependent()) {
llvm::APSInt addrSpace(32);
if (!AddrSpace->isIntegerConstantExpr(addrSpace, S.Context)) {
S.Diag(AttrLoc, diag::err_attribute_argument_type)
<< "'address_space'" << AANT_ArgumentIntegerConstant
<< AddrSpace->getSourceRange();
return false;
}
// Bounds checking.
if (addrSpace.isSigned()) {
if (addrSpace.isNegative()) {
S.Diag(AttrLoc, diag::err_attribute_address_space_negative)
<< AddrSpace->getSourceRange();
return false;
}
addrSpace.setIsSigned(false);
}
llvm::APSInt max(addrSpace.getBitWidth());
max =
Qualifiers::MaxAddressSpace - (unsigned)LangAS::FirstTargetAddressSpace;
if (addrSpace > max) {
S.Diag(AttrLoc, diag::err_attribute_address_space_too_high)
<< (unsigned)max.getZExtValue() << AddrSpace->getSourceRange();
return false;
}
ASIdx =
getLangASFromTargetAS(static_cast<unsigned>(addrSpace.getZExtValue()));
return true;
}
// Default value for DependentAddressSpaceTypes
ASIdx = LangAS::Default;
return true;
}
/// BuildAddressSpaceAttr - Builds a DependentAddressSpaceType if the expression
/// is uninstantiated. If the expression is instantiated, it applies the
/// appropriate address space to the type. This function allows dependent
/// template variables to be used in conjunction with the address_space
/// attribute.
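///
/// For example, a dependent use such as (the parameter name 'AS' is
/// illustrative)
///   template <int AS> void f(int __attribute__((address_space(AS))) *p);
/// produces a DependentAddressSpaceType until 'AS' is known.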
QualType Sema::BuildAddressSpaceAttr(QualType &T, LangAS ASIdx, Expr *AddrSpace,
SourceLocation AttrLoc) {
if (!AddrSpace->isValueDependent()) {
if (DiagnoseMultipleAddrSpaceAttributes(*this, T.getAddressSpace(), ASIdx,
AttrLoc))
return QualType();
return Context.getAddrSpaceQualType(T, ASIdx);
}
// This check has the same intent as checking whether a type already has an
// address space, but for dependent types: if the current type is already a
// DependentAddressSpaceType, it is already lined up to receive another
// address space, and we cannot have multiple address spaces on the same
// pointer indirection.
if (T->getAs<DependentAddressSpaceType>()) {
Diag(AttrLoc, diag::err_attribute_address_multiple_qualifiers);
return QualType();
}
return Context.getDependentAddressSpaceType(T, AddrSpace, AttrLoc);
}
QualType Sema::BuildAddressSpaceAttr(QualType &T, Expr *AddrSpace,
SourceLocation AttrLoc) {
LangAS ASIdx;
if (!BuildAddressSpaceIndex(*this, ASIdx, AddrSpace, AttrLoc))
return QualType();
return BuildAddressSpaceAttr(T, ASIdx, AddrSpace, AttrLoc);
}
/// HandleAddressSpaceTypeAttribute - Process an address_space attribute on the
/// specified type. The attribute contains 1 argument, the id of the address
/// space for the type.
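///
/// For example, a declaration such as
///   __attribute__((address_space(1))) int *ptr;
/// routes through here and qualifies the pointee 'int' with address space 1.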
static void HandleAddressSpaceTypeAttribute(QualType &Type,
const ParsedAttr &Attr,
TypeProcessingState &State) {
Sema &S = State.getSema();
// ISO/IEC TR 18037 S5.3 (amending C99 6.7.3): "A function type shall not be
// qualified by an address-space qualifier."
if (Type->isFunctionType()) {
S.Diag(Attr.getLoc(), diag::err_attribute_address_function_type);
Attr.setInvalid();
return;
}
LangAS ASIdx;
if (Attr.getKind() == ParsedAttr::AT_AddressSpace) {
// Check the attribute arguments.
if (Attr.getNumArgs() != 1) {
S.Diag(Attr.getLoc(), diag::err_attribute_wrong_number_arguments) << Attr
<< 1;
Attr.setInvalid();
return;
}
Expr *ASArgExpr;
if (Attr.isArgIdent(0)) {
// Special case where the argument is a template id.
CXXScopeSpec SS;
SourceLocation TemplateKWLoc;
UnqualifiedId id;
id.setIdentifier(Attr.getArgAsIdent(0)->Ident, Attr.getLoc());
ExprResult AddrSpace = S.ActOnIdExpression(
S.getCurScope(), SS, TemplateKWLoc, id, /*HasTrailingLParen=*/false,
/*IsAddressOfOperand=*/false);
if (AddrSpace.isInvalid())
return;
ASArgExpr = static_cast<Expr *>(AddrSpace.get());
} else {
ASArgExpr = static_cast<Expr *>(Attr.getArgAsExpr(0));
}
LangAS ASIdx;
if (!BuildAddressSpaceIndex(S, ASIdx, ASArgExpr, Attr.getLoc())) {
Attr.setInvalid();
return;
}
ASTContext &Ctx = S.Context;
auto *ASAttr = ::new (Ctx) AddressSpaceAttr(
Attr.getRange(), Ctx, Attr.getAttributeSpellingListIndex(),
static_cast<unsigned>(ASIdx));
// If the expression is not value dependent (not templated), then we can
// apply the address space qualifiers just to the equivalent type.
// Otherwise, we make an AttributedType with the modified and equivalent
// type the same, and wrap it in a DependentAddressSpaceType. When this
// dependent type is resolved, the qualifier is added to the equivalent type
// later.
QualType T;
if (!ASArgExpr->isValueDependent()) {
QualType EquivType =
S.BuildAddressSpaceAttr(Type, ASIdx, ASArgExpr, Attr.getLoc());
if (EquivType.isNull()) {
Attr.setInvalid();
return;
}
T = State.getAttributedType(ASAttr, Type, EquivType);
} else {
T = State.getAttributedType(ASAttr, Type, Type);
T = S.BuildAddressSpaceAttr(T, ASIdx, ASArgExpr, Attr.getLoc());
}
if (!T.isNull())
Type = T;
else
Attr.setInvalid();
} else {
// The keyword-based type attributes imply which address space to use.
ASIdx = Attr.asOpenCLLangAS();
if (ASIdx == LangAS::Default)
llvm_unreachable("Invalid address space");
if (DiagnoseMultipleAddrSpaceAttributes(S, Type.getAddressSpace(), ASIdx,
Attr.getLoc())) {
Attr.setInvalid();
return;
}
Type = S.Context.getAddrSpaceQualType(Type, ASIdx);
}
}
/// Does this type have a "direct" ownership qualifier? That is,
/// is it written like "__strong id", as opposed to something like
/// "typeof(foo)", where that happens to be strong?
static bool hasDirectOwnershipQualifier(QualType type) {
// Fast path: no qualifier at all.
assert(type.getQualifiers().hasObjCLifetime());
while (true) {
// __strong id
if (const AttributedType *attr = dyn_cast<AttributedType>(type)) {
if (attr->getAttrKind() == attr::ObjCOwnership)
return true;
type = attr->getModifiedType();
// X *__strong (...)
} else if (const ParenType *paren = dyn_cast<ParenType>(type)) {
type = paren->getInnerType();
// That's it for things we want to complain about. In particular,
// we do not want to look through typedefs, typeof(expr),
// typeof(type), or any other way that the type is somehow
// abstracted.
} else {
return false;
}
}
}
/// handleObjCOwnershipTypeAttr - Process an objc_ownership
/// attribute on the specified type.
///
/// Returns 'true' if the attribute was handled.
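///
/// For example, both '__weak id obj;' and
/// '__attribute__((objc_ownership(weak))) id obj;' are handled here.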
static bool handleObjCOwnershipTypeAttr(TypeProcessingState &state,
ParsedAttr &attr, QualType &type) {
bool NonObjCPointer = false;
if (!type->isDependentType() && !type->isUndeducedType()) {
if (const PointerType *ptr = type->getAs<PointerType>()) {
QualType pointee = ptr->getPointeeType();
if (pointee->isObjCRetainableType() || pointee->isPointerType())
return false;
// It is important not to lose the source info that there was an attribute
// applied to a non-ObjC pointer. We will create an attributed type, but
// its type will be the same as the original type.
NonObjCPointer = true;
} else if (!type->isObjCRetainableType()) {
return false;
}
// Don't accept an ownership attribute in the declspec if it would
// just be the return type of a block pointer.
if (state.isProcessingDeclSpec()) {
Declarator &D = state.getDeclarator();
if (maybeMovePastReturnType(D, D.getNumTypeObjects(),
/*onlyBlockPointers=*/true))
return false;
}
}
Sema &S = state.getSema();
SourceLocation AttrLoc = attr.getLoc();
if (AttrLoc.isMacroID())
AttrLoc =
S.getSourceManager().getImmediateExpansionRange(AttrLoc).getBegin();
if (!attr.isArgIdent(0)) {
S.Diag(AttrLoc, diag::err_attribute_argument_type) << attr
<< AANT_ArgumentString;
attr.setInvalid();
return true;
}
IdentifierInfo *II = attr.getArgAsIdent(0)->Ident;
Qualifiers::ObjCLifetime lifetime;
if (II->isStr("none"))
lifetime = Qualifiers::OCL_ExplicitNone;
else if (II->isStr("strong"))
lifetime = Qualifiers::OCL_Strong;
else if (II->isStr("weak"))
lifetime = Qualifiers::OCL_Weak;
else if (II->isStr("autoreleasing"))
lifetime = Qualifiers::OCL_Autoreleasing;
else {
S.Diag(AttrLoc, diag::warn_attribute_type_not_supported)
<< attr.getName() << II;
attr.setInvalid();
return true;
}
// Just ignore lifetime attributes other than __weak and __unsafe_unretained
// outside of ARC mode.
if (!S.getLangOpts().ObjCAutoRefCount &&
lifetime != Qualifiers::OCL_Weak &&
lifetime != Qualifiers::OCL_ExplicitNone) {
return true;
}
SplitQualType underlyingType = type.split();
// Check for redundant/conflicting ownership qualifiers.
if (Qualifiers::ObjCLifetime previousLifetime
= type.getQualifiers().getObjCLifetime()) {
// If it's written directly, that's an error.
if (hasDirectOwnershipQualifier(type)) {
S.Diag(AttrLoc, diag::err_attr_objc_ownership_redundant)
<< type;
return true;
}
// Otherwise, if the qualifiers actually conflict, pull sugar off
// and remove the ObjCLifetime qualifiers.
if (previousLifetime != lifetime) {
// It's possible to have multiple local ObjCLifetime qualifiers. We
// can't stop after we reach a type that is directly qualified.
const Type *prevTy = nullptr;
while (!prevTy || prevTy != underlyingType.Ty) {
prevTy = underlyingType.Ty;
underlyingType = underlyingType.getSingleStepDesugaredType();
}
underlyingType.Quals.removeObjCLifetime();
}
}
underlyingType.Quals.addObjCLifetime(lifetime);
if (NonObjCPointer) {
StringRef name = attr.getName()->getName();
switch (lifetime) {
case Qualifiers::OCL_None:
case Qualifiers::OCL_ExplicitNone:
break;
case Qualifiers::OCL_Strong: name = "__strong"; break;
case Qualifiers::OCL_Weak: name = "__weak"; break;
case Qualifiers::OCL_Autoreleasing: name = "__autoreleasing"; break;
}
S.Diag(AttrLoc, diag::warn_type_attribute_wrong_type) << name
<< TDS_ObjCObjOrBlock << type;
}
// Don't actually add the __unsafe_unretained qualifier in non-ARC files,
// because having both 'T' and '__unsafe_unretained T' exist in the type
// system causes unfortunate widespread consistency problems. (For example,
// they're not considered compatible types, and we mangle them identically
// as template arguments.) These problems are all individually fixable,
// but it's easier to just not add the qualifier and instead sniff it out
// in specific places using isObjCInertUnsafeUnretainedType().
//
// Doing this does mean we miss some trivial consistency checks that
// would've triggered in ARC, but that's better than trying to solve all
// the coexistence problems with __unsafe_unretained.
if (!S.getLangOpts().ObjCAutoRefCount &&
lifetime == Qualifiers::OCL_ExplicitNone) {
type = state.getAttributedType(
createSimpleAttr<ObjCInertUnsafeUnretainedAttr>(S.Context, attr),
type, type);
return true;
}
QualType origType = type;
if (!NonObjCPointer)
type = S.Context.getQualifiedType(underlyingType);
// If we have a valid source location for the attribute, use an
// AttributedType instead.
if (AttrLoc.isValid()) {
type = state.getAttributedType(::new (S.Context) ObjCOwnershipAttr(
attr.getRange(), S.Context, II,
attr.getAttributeSpellingListIndex()),
origType, type);
}
auto diagnoseOrDelay = [](Sema &S, SourceLocation loc,
unsigned diagnostic, QualType type) {
if (S.DelayedDiagnostics.shouldDelayDiagnostics()) {
S.DelayedDiagnostics.add(
sema::DelayedDiagnostic::makeForbiddenType(
S.getSourceManager().getExpansionLoc(loc),
diagnostic, type, /*ignored*/ 0));
} else {
S.Diag(loc, diagnostic);
}
};
// Sometimes, __weak isn't allowed.
if (lifetime == Qualifiers::OCL_Weak &&
!S.getLangOpts().ObjCWeak && !NonObjCPointer) {
// Use a specialized diagnostic if the runtime just doesn't support them.
unsigned diagnostic =
(S.getLangOpts().ObjCWeakRuntime ? diag::err_arc_weak_disabled
: diag::err_arc_weak_no_runtime);
// In any case, delay the diagnostic until we know what we're parsing.
diagnoseOrDelay(S, AttrLoc, diagnostic, type);
attr.setInvalid();
return true;
}
// Forbid __weak for class objects marked as
// objc_arc_weak_reference_unavailable
if (lifetime == Qualifiers::OCL_Weak) {
if (const ObjCObjectPointerType *ObjT =
type->getAs<ObjCObjectPointerType>()) {
if (ObjCInterfaceDecl *Class = ObjT->getInterfaceDecl()) {
if (Class->isArcWeakrefUnavailable()) {
S.Diag(AttrLoc, diag::err_arc_unsupported_weak_class);
S.Diag(ObjT->getInterfaceDecl()->getLocation(),
diag::note_class_declared);
}
}
}
}
return true;
}
/// handleObjCGCTypeAttr - Process the __attribute__((objc_gc)) type
/// attribute on the specified type. Returns true to indicate that
/// the attribute was handled, false to indicate that the type does
/// not permit the attribute.
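///
/// For example, under Objective-C GC a declaration such as
///   id __attribute__((objc_gc(weak))) obj;
/// adds the 'weak' GC qualifier to the pointer type.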
static bool handleObjCGCTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
QualType &type) {
Sema &S = state.getSema();
// Delay if this isn't some kind of pointer.
if (!type->isPointerType() &&
!type->isObjCObjectPointerType() &&
!type->isBlockPointerType())
return false;
if (type.getObjCGCAttr() != Qualifiers::GCNone) {
S.Diag(attr.getLoc(), diag::err_attribute_multiple_objc_gc);
attr.setInvalid();
return true;
}
// Check the attribute arguments.
if (!attr.isArgIdent(0)) {
S.Diag(attr.getLoc(), diag::err_attribute_argument_type)
<< attr << AANT_ArgumentString;
attr.setInvalid();
return true;
}
Qualifiers::GC GCAttr;
if (attr.getNumArgs() > 1) {
S.Diag(attr.getLoc(), diag::err_attribute_wrong_number_arguments) << attr
<< 1;
attr.setInvalid();
return true;
}
IdentifierInfo *II = attr.getArgAsIdent(0)->Ident;
if (II->isStr("weak"))
GCAttr = Qualifiers::Weak;
else if (II->isStr("strong"))
GCAttr = Qualifiers::Strong;
else {
S.Diag(attr.getLoc(), diag::warn_attribute_type_not_supported)
<< attr.getName() << II;
attr.setInvalid();
return true;
}
QualType origType = type;
type = S.Context.getObjCGCQualType(origType, GCAttr);
// Make an attributed type to preserve the source information.
if (attr.getLoc().isValid())
type = state.getAttributedType(
::new (S.Context) ObjCGCAttr(attr.getRange(), S.Context, II,
attr.getAttributeSpellingListIndex()),
origType, type);
return true;
}
namespace {
/// A helper class to unwrap a type down to a function for the
/// purposes of applying attributes there.
///
/// Use:
/// FunctionTypeUnwrapper unwrapped(SemaRef, T);
/// if (unwrapped.isFunctionType()) {
/// const FunctionType *fn = unwrapped.get();
/// // change fn somehow
/// T = unwrapped.wrap(fn);
/// }
struct FunctionTypeUnwrapper {
enum WrapKind {
Desugar,
Attributed,
Parens,
Pointer,
BlockPointer,
Reference,
MemberPointer
};
QualType Original;
const FunctionType *Fn;
SmallVector<unsigned char /*WrapKind*/, 8> Stack;
FunctionTypeUnwrapper(Sema &S, QualType T) : Original(T) {
while (true) {
const Type *Ty = T.getTypePtr();
if (isa<FunctionType>(Ty)) {
Fn = cast<FunctionType>(Ty);
return;
} else if (isa<ParenType>(Ty)) {
T = cast<ParenType>(Ty)->getInnerType();
Stack.push_back(Parens);
} else if (isa<PointerType>(Ty)) {
T = cast<PointerType>(Ty)->getPointeeType();
Stack.push_back(Pointer);
} else if (isa<BlockPointerType>(Ty)) {
T = cast<BlockPointerType>(Ty)->getPointeeType();
Stack.push_back(BlockPointer);
} else if (isa<MemberPointerType>(Ty)) {
T = cast<MemberPointerType>(Ty)->getPointeeType();
Stack.push_back(MemberPointer);
} else if (isa<ReferenceType>(Ty)) {
T = cast<ReferenceType>(Ty)->getPointeeType();
Stack.push_back(Reference);
} else if (isa<AttributedType>(Ty)) {
T = cast<AttributedType>(Ty)->getEquivalentType();
Stack.push_back(Attributed);
} else {
const Type *DTy = Ty->getUnqualifiedDesugaredType();
if (Ty == DTy) {
Fn = nullptr;
return;
}
T = QualType(DTy, 0);
Stack.push_back(Desugar);
}
}
}
bool isFunctionType() const { return (Fn != nullptr); }
const FunctionType *get() const { return Fn; }
QualType wrap(Sema &S, const FunctionType *New) {
// If T wasn't modified from the unwrapped type, do nothing.
if (New == get()) return Original;
Fn = New;
return wrap(S.Context, Original, 0);
}
private:
QualType wrap(ASTContext &C, QualType Old, unsigned I) {
if (I == Stack.size())
return C.getQualifiedType(Fn, Old.getQualifiers());
// Build up the inner type, applying the qualifiers from the old
// type to the new type.
SplitQualType SplitOld = Old.split();
// As a special case, tail-recurse if there are no qualifiers.
if (SplitOld.Quals.empty())
return wrap(C, SplitOld.Ty, I);
return C.getQualifiedType(wrap(C, SplitOld.Ty, I), SplitOld.Quals);
}
QualType wrap(ASTContext &C, const Type *Old, unsigned I) {
if (I == Stack.size()) return QualType(Fn, 0);
switch (static_cast<WrapKind>(Stack[I++])) {
case Desugar:
// This is the point at which we potentially lose source
// information.
return wrap(C, Old->getUnqualifiedDesugaredType(), I);
case Attributed:
return wrap(C, cast<AttributedType>(Old)->getEquivalentType(), I);
case Parens: {
QualType New = wrap(C, cast<ParenType>(Old)->getInnerType(), I);
return C.getParenType(New);
}
case Pointer: {
QualType New = wrap(C, cast<PointerType>(Old)->getPointeeType(), I);
return C.getPointerType(New);
}
case BlockPointer: {
QualType New = wrap(C, cast<BlockPointerType>(Old)->getPointeeType(),I);
return C.getBlockPointerType(New);
}
case MemberPointer: {
const MemberPointerType *OldMPT = cast<MemberPointerType>(Old);
QualType New = wrap(C, OldMPT->getPointeeType(), I);
return C.getMemberPointerType(New, OldMPT->getClass());
}
case Reference: {
const ReferenceType *OldRef = cast<ReferenceType>(Old);
QualType New = wrap(C, OldRef->getPointeeType(), I);
if (isa<LValueReferenceType>(OldRef))
return C.getLValueReferenceType(New, OldRef->isSpelledAsLValue());
else
return C.getRValueReferenceType(New);
}
}
llvm_unreachable("unknown wrapping kind");
}
};
} // end anonymous namespace
static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &State,
ParsedAttr &PAttr, QualType &Type) {
Sema &S = State.getSema();
Attr *A;
switch (PAttr.getKind()) {
default: llvm_unreachable("Unknown attribute kind");
case ParsedAttr::AT_Ptr32:
A = createSimpleAttr<Ptr32Attr>(S.Context, PAttr);
break;
case ParsedAttr::AT_Ptr64:
A = createSimpleAttr<Ptr64Attr>(S.Context, PAttr);
break;
case ParsedAttr::AT_SPtr:
A = createSimpleAttr<SPtrAttr>(S.Context, PAttr);
break;
case ParsedAttr::AT_UPtr:
A = createSimpleAttr<UPtrAttr>(S.Context, PAttr);
break;
}
attr::Kind NewAttrKind = A->getKind();
QualType Desugared = Type;
const AttributedType *AT = dyn_cast<AttributedType>(Type);
while (AT) {
attr::Kind CurAttrKind = AT->getAttrKind();
// You cannot specify duplicate type attributes, so if the attribute has
// already been applied, flag it.
if (NewAttrKind == CurAttrKind) {
S.Diag(PAttr.getLoc(), diag::warn_duplicate_attribute_exact)
<< PAttr.getName();
return true;
}
// You cannot have both __sptr and __uptr on the same type, nor can you
// have __ptr32 and __ptr64.
if ((CurAttrKind == attr::Ptr32 && NewAttrKind == attr::Ptr64) ||
(CurAttrKind == attr::Ptr64 && NewAttrKind == attr::Ptr32)) {
S.Diag(PAttr.getLoc(), diag::err_attributes_are_not_compatible)
<< "'__ptr32'" << "'__ptr64'";
return true;
} else if ((CurAttrKind == attr::SPtr && NewAttrKind == attr::UPtr) ||
(CurAttrKind == attr::UPtr && NewAttrKind == attr::SPtr)) {
S.Diag(PAttr.getLoc(), diag::err_attributes_are_not_compatible)
<< "'__sptr'" << "'__uptr'";
return true;
}
Desugared = AT->getEquivalentType();
AT = dyn_cast<AttributedType>(Desugared);
}
// Pointer type qualifiers can only operate on pointer types, but not
// pointer-to-member types.
//
// FIXME: Should we really be disallowing this attribute if there is any
// type sugar between it and the pointer (other than attributes)? Eg, this
// disallows the attribute on a parenthesized pointer.
// And if so, should we really allow *any* type attribute?
if (!isa<PointerType>(Desugared)) {
if (Type->isMemberPointerType())
S.Diag(PAttr.getLoc(), diag::err_attribute_no_member_pointers) << PAttr;
else
S.Diag(PAttr.getLoc(), diag::err_attribute_pointers_only) << PAttr << 0;
return true;
}
Type = State.getAttributedType(A, Type, Type);
return false;
}
/// Map a nullability attribute kind to a nullability kind.
static NullabilityKind mapNullabilityAttrKind(ParsedAttr::Kind kind) {
switch (kind) {
case ParsedAttr::AT_TypeNonNull:
return NullabilityKind::NonNull;
case ParsedAttr::AT_TypeNullable:
return NullabilityKind::Nullable;
case ParsedAttr::AT_TypeNullUnspecified:
return NullabilityKind::Unspecified;
default:
llvm_unreachable("not a nullability attribute kind");
}
}
/// Applies a nullability type specifier to the given type, if possible.
///
/// \param state The type processing state.
///
/// \param type The type to which the nullability specifier will be
/// added. On success, this type will be updated appropriately.
///
/// \param attr The attribute as written on the type.
///
/// \param allowOnArrayType Whether to accept nullability specifiers on an
/// array type (e.g., because it will decay to a pointer).
///
/// \returns true if a problem has been diagnosed, false on success.
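///
/// For example, 'int * _Nullable p;' attaches NullabilityKind::Nullable to
/// the pointer type, while conflicting specifiers such as
/// 'int * _Nonnull _Nullable q;' are diagnosed here.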
static bool checkNullabilityTypeSpecifier(TypeProcessingState &state,
QualType &type,
ParsedAttr &attr,
bool allowOnArrayType) {
Sema &S = state.getSema();
NullabilityKind nullability = mapNullabilityAttrKind(attr.getKind());
SourceLocation nullabilityLoc = attr.getLoc();
bool isContextSensitive = attr.isContextSensitiveKeywordAttribute();
recordNullabilitySeen(S, nullabilityLoc);
// Check for existing nullability attributes on the type.
QualType desugared = type;
while (auto attributed = dyn_cast<AttributedType>(desugared.getTypePtr())) {
// Check whether there is already a nullability attribute at this level.
if (auto existingNullability = attributed->getImmediateNullability()) {
// Duplicated nullability.
if (nullability == *existingNullability) {
S.Diag(nullabilityLoc, diag::warn_nullability_duplicate)
<< DiagNullabilityKind(nullability, isContextSensitive)
<< FixItHint::CreateRemoval(nullabilityLoc);
break;
}
// Conflicting nullability.
S.Diag(nullabilityLoc, diag::err_nullability_conflicting)
<< DiagNullabilityKind(nullability, isContextSensitive)
<< DiagNullabilityKind(*existingNullability, false);
return true;
}
desugared = attributed->getModifiedType();
}
// If there is already a different nullability specifier, complain.
// This (unlike the code above) looks through typedefs that might
// have nullability specifiers on them, which means we cannot
// provide a useful Fix-It.
if (auto existingNullability = desugared->getNullability(S.Context)) {
if (nullability != *existingNullability) {
S.Diag(nullabilityLoc, diag::err_nullability_conflicting)
<< DiagNullabilityKind(nullability, isContextSensitive)
<< DiagNullabilityKind(*existingNullability, false);
// Try to find the typedef with the existing nullability specifier.
if (auto typedefType = desugared->getAs<TypedefType>()) {
TypedefNameDecl *typedefDecl = typedefType->getDecl();
QualType underlyingType = typedefDecl->getUnderlyingType();
if (auto typedefNullability
= AttributedType::stripOuterNullability(underlyingType)) {
if (*typedefNullability == *existingNullability) {
S.Diag(typedefDecl->getLocation(), diag::note_nullability_here)
<< DiagNullabilityKind(*existingNullability, false);
}
}
}
return true;
}
}
// If this definitely isn't a pointer type, reject the specifier.
if (!desugared->canHaveNullability() &&
!(allowOnArrayType && desugared->isArrayType())) {
S.Diag(nullabilityLoc, diag::err_nullability_nonpointer)
<< DiagNullabilityKind(nullability, isContextSensitive) << type;
return true;
}
// For the context-sensitive keywords/Objective-C property
// attributes, require that the type be a single-level pointer.
if (isContextSensitive) {
// Make sure that the pointee isn't itself a pointer type.
const Type *pointeeType;
if (desugared->isArrayType())
pointeeType = desugared->getArrayElementTypeNoTypeQual();
else
pointeeType = desugared->getPointeeType().getTypePtr();
if (pointeeType->isAnyPointerType() ||
pointeeType->isObjCObjectPointerType() ||
pointeeType->isMemberPointerType()) {
S.Diag(nullabilityLoc, diag::err_nullability_cs_multilevel)
<< DiagNullabilityKind(nullability, true)
<< type;
S.Diag(nullabilityLoc, diag::note_nullability_type_specifier)
<< DiagNullabilityKind(nullability, false)
<< type
<< FixItHint::CreateReplacement(nullabilityLoc,
getNullabilitySpelling(nullability));
return true;
}
}
// Form the attributed type.
type = state.getAttributedType(
createNullabilityAttr(S.Context, attr, nullability), type, type);
return false;
}
/// Check the application of the Objective-C '__kindof' qualifier to
/// the given type.
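///
/// For example, '__kindof NSView *view;' is accepted (and the __kindof is
/// pushed down into the object type), while applying '__kindof' to a
/// non-object type such as 'int' is rejected.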
static bool checkObjCKindOfType(TypeProcessingState &state, QualType &type,
ParsedAttr &attr) {
Sema &S = state.getSema();
if (isa<ObjCTypeParamType>(type)) {
// Build the attributed type to record where __kindof occurred.
type = state.getAttributedType(
createSimpleAttr<ObjCKindOfAttr>(S.Context, attr), type, type);
return false;
}
// Find out if it's an Objective-C object or object pointer type.
const ObjCObjectPointerType *ptrType = type->getAs<ObjCObjectPointerType>();
const ObjCObjectType *objType = ptrType ? ptrType->getObjectType()
: type->getAs<ObjCObjectType>();
// If not, we can't apply __kindof.
if (!objType) {
// FIXME: Handle dependent types that aren't yet object types.
S.Diag(attr.getLoc(), diag::err_objc_kindof_nonobject)
<< type;
return true;
}
// Rebuild the "equivalent" type, which pushes __kindof down into
// the object type.
// There is no need to apply kindof on an unqualified id type.
QualType equivType = S.Context.getObjCObjectType(
objType->getBaseType(), objType->getTypeArgsAsWritten(),
objType->getProtocols(),
/*isKindOf=*/objType->isObjCUnqualifiedId() ? false : true);
// If we started with an object pointer type, rebuild it.
if (ptrType) {
equivType = S.Context.getObjCObjectPointerType(equivType);
if (auto nullability = type->getNullability(S.Context)) {
// We create a nullability attribute from the __kindof attribute.
// Make sure that will make sense.
assert(attr.getAttributeSpellingListIndex() == 0 &&
"multiple spellings for __kindof?");
Attr *A = createNullabilityAttr(S.Context, attr, *nullability);
A->setImplicit(true);
equivType = state.getAttributedType(A, equivType, equivType);
}
}
// Build the attributed type to record where __kindof occurred.
type = state.getAttributedType(
createSimpleAttr<ObjCKindOfAttr>(S.Context, attr), type, equivType);
return false;
}
/// Distribute a nullability type attribute that cannot be applied to
/// the type specifier to a pointer, block pointer, or member pointer
/// declarator, complaining if necessary.
///
/// \returns true if the nullability annotation was distributed, false
/// otherwise.
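///
/// For example, given '_Nonnull int *ptr;', the specifier written in the
/// declaration specifiers is moved onto the pointer declarator chunk, as if
/// the user had written 'int * _Nonnull ptr;'.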
static bool distributeNullabilityTypeAttr(TypeProcessingState &state,
QualType type, ParsedAttr &attr) {
Declarator &declarator = state.getDeclarator();
/// Attempt to move the attribute to the specified chunk.
auto moveToChunk = [&](DeclaratorChunk &chunk, bool inFunction) -> bool {
// If there is already a nullability attribute there, don't add
// one.
if (hasNullabilityAttr(chunk.getAttrs()))
return false;
// Complain about the nullability qualifier being in the wrong
// place.
enum {
PK_Pointer,
PK_BlockPointer,
PK_MemberPointer,
PK_FunctionPointer,
PK_MemberFunctionPointer,
} pointerKind
= chunk.Kind == DeclaratorChunk::Pointer ? (inFunction ? PK_FunctionPointer
: PK_Pointer)
: chunk.Kind == DeclaratorChunk::BlockPointer ? PK_BlockPointer
: inFunction? PK_MemberFunctionPointer : PK_MemberPointer;
auto diag = state.getSema().Diag(attr.getLoc(),
diag::warn_nullability_declspec)
<< DiagNullabilityKind(mapNullabilityAttrKind(attr.getKind()),
attr.isContextSensitiveKeywordAttribute())
<< type
<< static_cast<unsigned>(pointerKind);
// FIXME: MemberPointer chunks don't carry the location of the *.
if (chunk.Kind != DeclaratorChunk::MemberPointer) {
diag << FixItHint::CreateRemoval(attr.getLoc())
<< FixItHint::CreateInsertion(
state.getSema().getPreprocessor()
.getLocForEndOfToken(chunk.Loc),
" " + attr.getName()->getName().str() + " ");
}
moveAttrFromListToList(attr, state.getCurrentAttributes(),
chunk.getAttrs());
return true;
};
// Move it to the outermost pointer, member pointer, or block
// pointer declarator.
for (unsigned i = state.getCurrentChunkIndex(); i != 0; --i) {
DeclaratorChunk &chunk = declarator.getTypeObject(i-1);
switch (chunk.Kind) {
case DeclaratorChunk::Pointer:
case DeclaratorChunk::BlockPointer:
case DeclaratorChunk::MemberPointer:
return moveToChunk(chunk, false);
case DeclaratorChunk::Paren:
case DeclaratorChunk::Array:
continue;
case DeclaratorChunk::Function:
// Try to move past the return type to a function/block/member
// function pointer.
if (DeclaratorChunk *dest = maybeMovePastReturnType(
declarator, i,
/*onlyBlockPointers=*/false)) {
return moveToChunk(*dest, true);
}
return false;
// Don't walk through these.
case DeclaratorChunk::Reference:
case DeclaratorChunk::Pipe:
return false;
}
}
return false;
}
static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) {
assert(!Attr.isInvalid());
switch (Attr.getKind()) {
default:
llvm_unreachable("not a calling convention attribute");
case ParsedAttr::AT_CDecl:
return createSimpleAttr<CDeclAttr>(Ctx, Attr);
case ParsedAttr::AT_FastCall:
return createSimpleAttr<FastCallAttr>(Ctx, Attr);
case ParsedAttr::AT_StdCall:
return createSimpleAttr<StdCallAttr>(Ctx, Attr);
case ParsedAttr::AT_ThisCall:
return createSimpleAttr<ThisCallAttr>(Ctx, Attr);
case ParsedAttr::AT_RegCall:
return createSimpleAttr<RegCallAttr>(Ctx, Attr);
case ParsedAttr::AT_Pascal:
return createSimpleAttr<PascalAttr>(Ctx, Attr);
case ParsedAttr::AT_SwiftCall:
return createSimpleAttr<SwiftCallAttr>(Ctx, Attr);
case ParsedAttr::AT_VectorCall:
return createSimpleAttr<VectorCallAttr>(Ctx, Attr);
case ParsedAttr::AT_AArch64VectorPcs:
return createSimpleAttr<AArch64VectorPcsAttr>(Ctx, Attr);
case ParsedAttr::AT_Pcs: {
// The attribute may have had a fixit applied where we treated an
// identifier as a string literal. The contents of the string are valid,
// but the form may not be.
StringRef Str;
if (Attr.isArgExpr(0))
Str = cast<StringLiteral>(Attr.getArgAsExpr(0))->getString();
else
Str = Attr.getArgAsIdent(0)->Ident->getName();
PcsAttr::PCSType Type;
if (!PcsAttr::ConvertStrToPCSType(Str, Type))
llvm_unreachable("already validated the attribute");
return ::new (Ctx) PcsAttr(Attr.getRange(), Ctx, Type,
Attr.getAttributeSpellingListIndex());
}
case ParsedAttr::AT_IntelOclBicc:
return createSimpleAttr<IntelOclBiccAttr>(Ctx, Attr);
case ParsedAttr::AT_MSABI:
return createSimpleAttr<MSABIAttr>(Ctx, Attr);
case ParsedAttr::AT_SysVABI:
return createSimpleAttr<SysVABIAttr>(Ctx, Attr);
case ParsedAttr::AT_PreserveMost:
return createSimpleAttr<PreserveMostAttr>(Ctx, Attr);
case ParsedAttr::AT_PreserveAll:
return createSimpleAttr<PreserveAllAttr>(Ctx, Attr);
}
llvm_unreachable("unexpected attribute kind!");
}
/// Process an individual function attribute. Returns true to
/// indicate that the attribute was handled, false if it wasn't.
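///
/// For example, a calling-convention attribute in a declaration such as
///   void (__stdcall *fp)(int);
/// is applied to the unwrapped function type here, while an attribute that
/// has not yet reached a function type is delayed by returning false.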
static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
QualType &type) {
Sema &S = state.getSema();
FunctionTypeUnwrapper unwrapped(S, type);
if (attr.getKind() == ParsedAttr::AT_NoReturn) {
if (S.CheckAttrNoArgs(attr))
return true;
// Delay if this is not a function type.
if (!unwrapped.isFunctionType())
return false;
// Otherwise we can process right away.
FunctionType::ExtInfo EI = unwrapped.get()->getExtInfo().withNoReturn(true);
type = unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI));
return true;
}
// ns_returns_retained is not always a type attribute, but if we got
// here, we're treating it as one right now.
if (attr.getKind() == ParsedAttr::AT_NSReturnsRetained) {
if (attr.getNumArgs()) return true;
// Delay if this is not a function type.
if (!unwrapped.isFunctionType())
return false;
// Check whether the return type is reasonable.
if (S.checkNSReturnsRetainedReturnType(attr.getLoc(),
unwrapped.get()->getReturnType()))
return true;
// Only actually change the underlying type in ARC builds.
QualType origType = type;
if (state.getSema().getLangOpts().ObjCAutoRefCount) {
FunctionType::ExtInfo EI
= unwrapped.get()->getExtInfo().withProducesResult(true);
type = unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI));
}
type = state.getAttributedType(
createSimpleAttr<NSReturnsRetainedAttr>(S.Context, attr),
origType, type);
return true;
}
if (attr.getKind() == ParsedAttr::AT_AnyX86NoCallerSavedRegisters) {
if (S.CheckAttrTarget(attr) || S.CheckAttrNoArgs(attr))
return true;
// Delay if this is not a function type.
if (!unwrapped.isFunctionType())
return false;
FunctionType::ExtInfo EI =
unwrapped.get()->getExtInfo().withNoCallerSavedRegs(true);
type = unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI));
return true;
}
if (attr.getKind() == ParsedAttr::AT_AnyX86NoCfCheck) {
if (!S.getLangOpts().CFProtectionBranch) {
S.Diag(attr.getLoc(), diag::warn_nocf_check_attribute_ignored);
attr.setInvalid();
return true;
}
if (S.CheckAttrTarget(attr) || S.CheckAttrNoArgs(attr))
return true;
// If this is not a function type, the warning will be emitted by the
// subject check.
if (!unwrapped.isFunctionType())
return true;
FunctionType::ExtInfo EI =
unwrapped.get()->getExtInfo().withNoCfCheck(true);
type = unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI));
return true;
}
if (attr.getKind() == ParsedAttr::AT_Regparm) {
unsigned value;
if (S.CheckRegparmAttr(attr, value))
return true;
// Delay if this is not a function type.
if (!unwrapped.isFunctionType())
return false;
// Diagnose regparm with fastcall.
const FunctionType *fn = unwrapped.get();
CallingConv CC = fn->getCallConv();
if (CC == CC_X86FastCall) {
S.Diag(attr.getLoc(), diag::err_attributes_are_not_compatible)
<< FunctionType::getNameForCallConv(CC)
<< "regparm";
attr.setInvalid();
return true;
}
FunctionType::ExtInfo EI =
unwrapped.get()->getExtInfo().withRegParm(value);
type = unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI));
return true;
}
if (attr.getKind() == ParsedAttr::AT_NoThrow) {
// Delay if this is not a function type.
if (!unwrapped.isFunctionType())
return false;
if (S.CheckAttrNoArgs(attr)) {
attr.setInvalid();
return true;
}
// Otherwise we can process right away.
auto *Proto = unwrapped.get()->castAs<FunctionProtoType>();
// MSVC ignores nothrow if it is in conflict with an explicit exception
// specification.
if (Proto->hasExceptionSpec()) {
switch (Proto->getExceptionSpecType()) {
case EST_None:
llvm_unreachable("This doesn't have an exception spec!");
case EST_DynamicNone:
case EST_BasicNoexcept:
case EST_NoexceptTrue:
case EST_NoThrow:
// Exception spec doesn't conflict with nothrow, so don't warn.
LLVM_FALLTHROUGH;
case EST_Unparsed:
case EST_Uninstantiated:
case EST_DependentNoexcept:
case EST_Unevaluated:
// We don't have enough information to properly determine if there is a
// conflict, so suppress the warning.
break;
case EST_Dynamic:
case EST_MSAny:
case EST_NoexceptFalse:
S.Diag(attr.getLoc(), diag::warn_nothrow_attribute_ignored);
break;
}
return true;
}
type = unwrapped.wrap(
S, S.Context
.getFunctionTypeWithExceptionSpec(
QualType{Proto, 0},
FunctionProtoType::ExceptionSpecInfo{EST_NoThrow})
->getAs<FunctionType>());
return true;
}
// Delay if the type didn't work out to a function.
if (!unwrapped.isFunctionType()) return false;
// Otherwise, a calling convention.
CallingConv CC;
if (S.CheckCallingConvAttr(attr, CC))
return true;
const FunctionType *fn = unwrapped.get();
CallingConv CCOld = fn->getCallConv();
Attr *CCAttr = getCCTypeAttr(S.Context, attr);
if (CCOld != CC) {
// Error out when there's already an attribute on the type
// and the CCs don't match.
if (S.getCallingConvAttributedType(type)) {
S.Diag(attr.getLoc(), diag::err_attributes_are_not_compatible)
<< FunctionType::getNameForCallConv(CC)
<< FunctionType::getNameForCallConv(CCOld);
attr.setInvalid();
return true;
}
}
// Diagnose use of variadic functions with calling conventions that
// don't support them (e.g. because they're callee-cleanup).
// We delay warning about this on unprototyped function declarations
// until after redeclaration checking, just in case we pick up a
// prototype that way. And apparently we also "delay" warning about
// unprototyped function types in general, despite not necessarily having
// much ability to diagnose it later.
if (!supportsVariadicCall(CC)) {
const FunctionProtoType *FnP = dyn_cast<FunctionProtoType>(fn);
if (FnP && FnP->isVariadic()) {
// stdcall and fastcall are ignored with a warning for GCC and MS
// compatibility.
if (CC == CC_X86StdCall || CC == CC_X86FastCall)
return S.Diag(attr.getLoc(), diag::warn_cconv_unsupported)
<< FunctionType::getNameForCallConv(CC)
<< (int)Sema::CallingConventionIgnoredReason::VariadicFunction;
attr.setInvalid();
return S.Diag(attr.getLoc(), diag::err_cconv_varargs)
<< FunctionType::getNameForCallConv(CC);
}
}
// Also diagnose fastcall with regparm.
if (CC == CC_X86FastCall && fn->getHasRegParm()) {
S.Diag(attr.getLoc(), diag::err_attributes_are_not_compatible)
<< "regparm" << FunctionType::getNameForCallConv(CC_X86FastCall);
attr.setInvalid();
return true;
}
// Modify the CC from the wrapped function type, wrap it all back, and then
// wrap the whole thing in an AttributedType as written. The modified type
// might have a different CC if we ignored the attribute.
QualType Equivalent;
if (CCOld == CC) {
Equivalent = type;
} else {
auto EI = unwrapped.get()->getExtInfo().withCallingConv(CC);
Equivalent =
unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI));
}
type = state.getAttributedType(CCAttr, type, Equivalent);
return true;
}
bool Sema::hasExplicitCallingConv(QualType T) {
const AttributedType *AT;
// Stop if we'd be stripping off a typedef sugar node to reach the
// AttributedType.
while ((AT = T->getAs<AttributedType>()) &&
AT->getAs<TypedefType>() == T->getAs<TypedefType>()) {
if (AT->isCallingConv())
return true;
T = AT->getModifiedType();
}
return false;
}
void Sema::adjustMemberFunctionCC(QualType &T, bool IsStatic, bool IsCtorOrDtor,
SourceLocation Loc) {
FunctionTypeUnwrapper Unwrapped(*this, T);
const FunctionType *FT = Unwrapped.get();
bool IsVariadic = (isa<FunctionProtoType>(FT) &&
cast<FunctionProtoType>(FT)->isVariadic());
CallingConv CurCC = FT->getCallConv();
CallingConv ToCC = Context.getDefaultCallingConvention(IsVariadic, !IsStatic);
if (CurCC == ToCC)
return;
// MS compiler ignores explicit calling convention attributes on structors. We
// should do the same.
if (Context.getTargetInfo().getCXXABI().isMicrosoft() && IsCtorOrDtor) {
// Issue a warning on an ignored calling convention, except for __stdcall.
// Again, this is what the MS compiler does.
if (CurCC != CC_X86StdCall)
Diag(Loc, diag::warn_cconv_unsupported)
<< FunctionType::getNameForCallConv(CurCC)
<< (int)Sema::CallingConventionIgnoredReason::ConstructorDestructor;
// Default adjustment.
} else {
// Only adjust types with the default convention. For example, on Windows
// we should adjust a __cdecl type to __thiscall for instance methods, and a
// __thiscall type to __cdecl for static methods.
CallingConv DefaultCC =
Context.getDefaultCallingConvention(IsVariadic, IsStatic);
if (CurCC != DefaultCC || DefaultCC == ToCC)
return;
if (hasExplicitCallingConv(T))
return;
}
FT = Context.adjustFunctionType(FT, FT->getExtInfo().withCallingConv(ToCC));
QualType Wrapped = Unwrapped.wrap(*this, FT);
T = Context.getAdjustedType(T, Wrapped);
}
/// HandleVectorSizeAttribute - this attribute is only applicable to integral
/// and float scalars, although arrays, pointers, and function return values are
/// allowed in conjunction with this construct. Aggregates with this attribute
/// are invalid, even if they are of the same size as a corresponding scalar.
/// The raw attribute should contain precisely 1 argument, the vector size for
/// the variable, measured in bytes. If curType and rawAttr are well formed,
/// this routine will return a new vector type.
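///
/// For example (on a target with a 4-byte 'int'):
///   typedef int v4si __attribute__((vector_size(16)));
/// declares a vector of four 'int' elements.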
static void HandleVectorSizeAttr(QualType &CurType, const ParsedAttr &Attr,
Sema &S) {
// Check the attribute arguments.
if (Attr.getNumArgs() != 1) {
S.Diag(Attr.getLoc(), diag::err_attribute_wrong_number_arguments) << Attr
<< 1;
Attr.setInvalid();
return;
}
Expr *SizeExpr;
// Special case where the argument is a template id.
if (Attr.isArgIdent(0)) {
CXXScopeSpec SS;
SourceLocation TemplateKWLoc;
UnqualifiedId Id;
Id.setIdentifier(Attr.getArgAsIdent(0)->Ident, Attr.getLoc());
ExprResult Size = S.ActOnIdExpression(S.getCurScope(), SS, TemplateKWLoc,
Id, /*HasTrailingLParen=*/false,
/*IsAddressOfOperand=*/false);
if (Size.isInvalid())
return;
SizeExpr = Size.get();
} else {
SizeExpr = Attr.getArgAsExpr(0);
}
QualType T = S.BuildVectorType(CurType, SizeExpr, Attr.getLoc());
if (!T.isNull())
CurType = T;
else
Attr.setInvalid();
}
/// Process the OpenCL-like ext_vector_type attribute when it occurs on
/// a type.
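///
/// For example:
///   typedef float float4 __attribute__((ext_vector_type(4)));
/// declares a 4-element OpenCL-style vector that supports component access
/// such as 'v.xyzw'.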
static void HandleExtVectorTypeAttr(QualType &CurType, const ParsedAttr &Attr,
Sema &S) {
// Check the attribute arguments.
if (Attr.getNumArgs() != 1) {
S.Diag(Attr.getLoc(), diag::err_attribute_wrong_number_arguments) << Attr
<< 1;
return;
}
Expr *sizeExpr;
// Special case where the argument is a template id.
if (Attr.isArgIdent(0)) {
CXXScopeSpec SS;
SourceLocation TemplateKWLoc;
UnqualifiedId id;
id.setIdentifier(Attr.getArgAsIdent(0)->Ident, Attr.getLoc());
ExprResult Size = S.ActOnIdExpression(S.getCurScope(), SS, TemplateKWLoc,
id, /*HasTrailingLParen=*/false,
/*IsAddressOfOperand=*/false);
if (Size.isInvalid())
return;
sizeExpr = Size.get();
} else {
sizeExpr = Attr.getArgAsExpr(0);
}
// Create the vector type.
QualType T = S.BuildExtVectorType(CurType, sizeExpr, Attr.getLoc());
if (!T.isNull())
CurType = T;
}
static bool isPermittedNeonBaseType(QualType &Ty,
VectorType::VectorKind VecKind, Sema &S) {
const BuiltinType *BTy = Ty->getAs<BuiltinType>();
if (!BTy)
return false;
llvm::Triple Triple = S.Context.getTargetInfo().getTriple();
// Signed poly is mathematically wrong, but has been baked into some ABIs by
// now.
bool IsPolyUnsigned = Triple.getArch() == llvm::Triple::aarch64 ||
Triple.getArch() == llvm::Triple::aarch64_be;
if (VecKind == VectorType::NeonPolyVector) {
if (IsPolyUnsigned) {
// AArch64 polynomial vectors are unsigned and support poly64.
return BTy->getKind() == BuiltinType::UChar ||
BTy->getKind() == BuiltinType::UShort ||
BTy->getKind() == BuiltinType::ULong ||
BTy->getKind() == BuiltinType::ULongLong;
} else {
// AArch32 polynomial vectors are signed.
return BTy->getKind() == BuiltinType::SChar ||
BTy->getKind() == BuiltinType::Short;
}
}
// Non-polynomial vector types: the usual suspects are allowed, as well as
// float64_t on AArch64.
bool Is64Bit = Triple.getArch() == llvm::Triple::aarch64 ||
Triple.getArch() == llvm::Triple::aarch64_be;
if (Is64Bit && BTy->getKind() == BuiltinType::Double)
return true;
return BTy->getKind() == BuiltinType::SChar ||
BTy->getKind() == BuiltinType::UChar ||
BTy->getKind() == BuiltinType::Short ||
BTy->getKind() == BuiltinType::UShort ||
BTy->getKind() == BuiltinType::Int ||
BTy->getKind() == BuiltinType::UInt ||
BTy->getKind() == BuiltinType::Long ||
BTy->getKind() == BuiltinType::ULong ||
BTy->getKind() == BuiltinType::LongLong ||
BTy->getKind() == BuiltinType::ULongLong ||
BTy->getKind() == BuiltinType::Float ||
BTy->getKind() == BuiltinType::Half;
}
/// HandleNeonVectorTypeAttr - The "neon_vector_type" and
/// "neon_polyvector_type" attributes are used to create vector types that
/// are mangled according to ARM's ABI. Otherwise, these types are identical
/// to those created with the "vector_size" attribute. Unlike "vector_size"
/// the argument to these Neon attributes is the number of vector elements,
/// not the vector size in bytes. The vector width and element type must
/// match one of the standard Neon vector types.
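///
/// For example, arm_neon.h defines types along the lines of
///   typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
/// which totals 128 bits.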
static void HandleNeonVectorTypeAttr(QualType &CurType, const ParsedAttr &Attr,
Sema &S, VectorType::VectorKind VecKind) {
// Target must have NEON
if (!S.Context.getTargetInfo().hasFeature("neon")) {
S.Diag(Attr.getLoc(), diag::err_attribute_unsupported) << Attr;
Attr.setInvalid();
return;
}
// Check the attribute arguments.
if (Attr.getNumArgs() != 1) {
S.Diag(Attr.getLoc(), diag::err_attribute_wrong_number_arguments) << Attr
<< 1;
Attr.setInvalid();
return;
}
// The number of elements must be an ICE.
Expr *numEltsExpr = static_cast<Expr *>(Attr.getArgAsExpr(0));
llvm::APSInt numEltsInt(32);
if (numEltsExpr->isTypeDependent() || numEltsExpr->isValueDependent() ||
!numEltsExpr->isIntegerConstantExpr(numEltsInt, S.Context)) {
S.Diag(Attr.getLoc(), diag::err_attribute_argument_type)
<< Attr << AANT_ArgumentIntegerConstant
<< numEltsExpr->getSourceRange();
Attr.setInvalid();
return;
}
// Only certain element types are supported for Neon vectors.
if (!isPermittedNeonBaseType(CurType, VecKind, S)) {
S.Diag(Attr.getLoc(), diag::err_attribute_invalid_vector_type) << CurType;
Attr.setInvalid();
return;
}
// The total size of the vector must be 64 or 128 bits.
unsigned typeSize = static_cast<unsigned>(S.Context.getTypeSize(CurType));
unsigned numElts = static_cast<unsigned>(numEltsInt.getZExtValue());
unsigned vecSize = typeSize * numElts;
if (vecSize != 64 && vecSize != 128) {
S.Diag(Attr.getLoc(), diag::err_attribute_bad_neon_vector_size) << CurType;
Attr.setInvalid();
return;
}
CurType = S.Context.getVectorType(CurType, numElts, VecKind);
}
/// Handle OpenCL Access Qualifier Attribute.
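///
/// For example, 'kernel void f(write_only image2d_t dst);' applies the
/// 'write_only' access qualifier to the image parameter's type.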
static void HandleOpenCLAccessAttr(QualType &CurType, const ParsedAttr &Attr,
Sema &S) {
// OpenCL v2.0 s6.6 - Access qualifiers can only be used for image and pipe types.
if (!(CurType->isImageType() || CurType->isPipeType())) {
S.Diag(Attr.getLoc(), diag::err_opencl_invalid_access_qualifier);
Attr.setInvalid();
return;
}
if (const TypedefType* TypedefTy = CurType->getAs<TypedefType>()) {
QualType BaseTy = TypedefTy->desugar();
std::string PrevAccessQual;
if (BaseTy->isPipeType()) {
if (TypedefTy->getDecl()->hasAttr<OpenCLAccessAttr>()) {
OpenCLAccessAttr *Attr =
TypedefTy->getDecl()->getAttr<OpenCLAccessAttr>();
PrevAccessQual = Attr->getSpelling();
} else {
PrevAccessQual = "read_only";
}
} else if (const BuiltinType* ImgType = BaseTy->getAs<BuiltinType>()) {
switch (ImgType->getKind()) {
#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
case BuiltinType::Id: \
PrevAccessQual = #Access; \
break;
#include "clang/Basic/OpenCLImageTypes.def"
default:
llvm_unreachable("Unable to find corresponding image type.");
}
} else {
llvm_unreachable("unexpected type");
}
StringRef AttrName = Attr.getName()->getName();
if (PrevAccessQual == AttrName.ltrim("_")) {
// Duplicated qualifiers
S.Diag(Attr.getLoc(), diag::warn_duplicate_declspec)
<< AttrName << Attr.getRange();
} else {
// Contradicting qualifiers
S.Diag(Attr.getLoc(), diag::err_opencl_multiple_access_qualifiers);
}
S.Diag(TypedefTy->getDecl()->getBeginLoc(),
diag::note_opencl_typedef_access_qualifier) << PrevAccessQual;
} else if (CurType->isPipeType()) {
if (Attr.getSemanticSpelling() == OpenCLAccessAttr::Keyword_write_only) {
QualType ElemType = CurType->getAs<PipeType>()->getElementType();
CurType = S.Context.getWritePipeType(ElemType);
}
}
}
static void deduceOpenCLImplicitAddrSpace(TypeProcessingState &State,
QualType &T, TypeAttrLocation TAL) {
Declarator &D = State.getDeclarator();
// Handle the cases where address space should not be deduced.
//
// The pointee type of a pointer type is always deduced since a pointer
// always points to some memory location, which should have an address space.
//
// There are situations where, at the point of certain declarations, the
// address space may be unknown and is better left as the default. For
// example, when defining a typedef or struct type, they are not associated
// with any specific address space. Later on, they may be used with any
// address space to declare a variable.
//
// The return value of a function is an r-value and therefore should not
// have an address space.
//
// The void type does not occupy memory and therefore should not have an
// address space, except when it is used as a pointee type.
//
// Since LLVM assumes a function type is in the default address space, it
// should not have an address space.
auto ChunkIndex = State.getCurrentChunkIndex();
bool IsPointee =
ChunkIndex > 0 &&
(D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::Pointer ||
- D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::BlockPointer ||
- D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::Reference);
+ D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::Reference ||
+ D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::BlockPointer);
+ // For pointers/references to arrays the next chunk is always an array
+ // followed by any number of parentheses.
+ if (!IsPointee && ChunkIndex > 1) {
+ auto AdjustedCI = ChunkIndex - 1;
+ if (D.getTypeObject(AdjustedCI).Kind == DeclaratorChunk::Array)
+ AdjustedCI--;
+ // Skip over all parentheses.
+ while (AdjustedCI > 0 &&
+ D.getTypeObject(AdjustedCI).Kind == DeclaratorChunk::Paren)
+ AdjustedCI--;
+ if (D.getTypeObject(AdjustedCI).Kind == DeclaratorChunk::Pointer ||
+ D.getTypeObject(AdjustedCI).Kind == DeclaratorChunk::Reference)
+ IsPointee = true;
+ }
bool IsFuncReturnType =
ChunkIndex > 0 &&
D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::Function;
bool IsFuncType =
ChunkIndex < D.getNumTypeObjects() &&
D.getTypeObject(ChunkIndex).Kind == DeclaratorChunk::Function;
if ( // Do not deduce addr space for function return type and function type,
// otherwise it will fail some sema check.
IsFuncReturnType || IsFuncType ||
// Do not deduce addr space for member types of struct, except the pointee
// type of a pointer member type or static data members.
(D.getContext() == DeclaratorContext::MemberContext &&
(!IsPointee &&
D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_static)) ||
// Do not deduce addr space of non-pointee in type alias because it
// doesn't define any object.
(D.getContext() == DeclaratorContext::AliasDeclContext && !IsPointee) ||
// Do not deduce addr space for types used to define a typedef and the
// typedef itself, except the pointee type of a pointer type which is used
// to define the typedef.
(D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_typedef &&
!IsPointee) ||
// Do not deduce addr space of the void type, e.g. in f(void), otherwise
// it will fail some sema check.
(T->isVoidType() && !IsPointee) ||
// Do not deduce addr spaces for dependent types because they might end
// up instantiating to a type with an explicit address space qualifier.
// Except for pointer or reference types because the addr space in
// template argument can only belong to a pointee.
(T->isDependentType() && !T->isPointerType() && !T->isReferenceType()) ||
// Do not deduce addr space of decltype because it will be taken from
// its argument.
T->isDecltypeType() ||
// OpenCL spec v2.0 s6.9.b:
// The sampler type cannot be used with the __local and __global address
// space qualifiers.
// OpenCL spec v2.0 s6.13.14:
// Samplers can also be declared as global constants in the program
// source using the following syntax.
// const sampler_t <sampler name> = <value>
// In codegen, a file-scope sampler type variable has special handling and
// does not rely on address space qualifier. On the other hand, deducing
// address space of const sampler file-scope variable as global address
// space causes spurious diagnostic about __global address space
// qualifier, therefore do not deduce address space of file-scope sampler
// type variable.
(D.getContext() == DeclaratorContext::FileContext && T->isSamplerT()))
return;
LangAS ImpAddr = LangAS::Default;
// Put OpenCL automatic variable in private address space.
// OpenCL v1.2 s6.5:
// The default address space name for arguments to a function in a
// program, or local variables of a function is __private. All function
// arguments shall be in the __private address space.
if (State.getSema().getLangOpts().OpenCLVersion <= 120 &&
!State.getSema().getLangOpts().OpenCLCPlusPlus) {
ImpAddr = LangAS::opencl_private;
} else {
// If the address space is not set, OpenCL 2.0 defines non-private default
// address spaces for some cases:
// OpenCL 2.0, section 6.5:
// The address space for a variable at program scope or a static variable
// inside a function can either be __global or __constant, but defaults to
// __global if not specified.
// (...)
// Pointers that are declared without pointing to a named address space
// point to the generic address space.
if (IsPointee) {
ImpAddr = LangAS::opencl_generic;
} else {
if (D.getContext() == DeclaratorContext::TemplateArgContext) {
// Do not deduce address space for non-pointee type in template arg.
} else if (D.getContext() == DeclaratorContext::FileContext) {
ImpAddr = LangAS::opencl_global;
} else {
if (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_static ||
D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_extern) {
ImpAddr = LangAS::opencl_global;
} else {
ImpAddr = LangAS::opencl_private;
}
}
}
}
T = State.getSema().Context.getAddrSpaceQualType(T, ImpAddr);
}
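For orientation (an illustrative sketch, not part of this diff), the declarations below show the address spaces the routine above would deduce when compiling OpenCL 2.0 or C++ for OpenCL; all names are hypothetical.
// Illustrative only: address spaces deduced by deduceOpenCLImplicitAddrSpace
// for OpenCL 2.0 / C++ for OpenCL (hypothetical names).
int g;                   // program scope, no qualifier -> __global
constant int c = 42;     // explicit __constant, nothing left to deduce
kernel void k(int *p) {  // pointee of p -> generic address space
  int l;                 // automatic local variable -> __private
  static int s;          // static storage class -> __global
}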
static void HandleLifetimeBoundAttr(TypeProcessingState &State,
QualType &CurType,
ParsedAttr &Attr) {
if (State.getDeclarator().isDeclarationOfFunction()) {
CurType = State.getAttributedType(
createSimpleAttr<LifetimeBoundAttr>(State.getSema().Context, Attr),
CurType, CurType);
} else {
Attr.diagnoseAppertainsTo(State.getSema(), nullptr);
}
}
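For context (a sketch using Clang's documented lifetimebound attribute, not code from this diff), the attribute handled above is typically written on reference parameters so Clang can warn when the returned reference outlives a temporary argument:
// Hypothetical [[clang::lifetimebound]] usage; the last line is the kind of
// call Clang can flag as dangling.
const int &smaller(const int &a [[clang::lifetimebound]],
                   const int &b [[clang::lifetimebound]]) {
  return a < b ? a : b;
}
const int &r = smaller(1, 2); // binds to a temporary; expect a dangling warning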
static void processTypeAttrs(TypeProcessingState &state, QualType &type,
TypeAttrLocation TAL,
ParsedAttributesView &attrs) {
// Scan through and apply attributes to this type where it makes sense. Some
// attributes (such as __address_space__, __vector_size__, etc) apply to the
// type, but others can be present in the type specifiers even though they
// apply to the decl. Here we apply type attributes and ignore the rest.
// This loop modifies the list pretty frequently, but we still need to make
// sure we visit every element once. Copy the attributes list, and iterate
// over that.
ParsedAttributesView AttrsCopy{attrs};
state.setParsedNoDeref(false);
for (ParsedAttr &attr : AttrsCopy) {
// Skip attributes that were marked to be invalid.
if (attr.isInvalid())
continue;
if (attr.isCXX11Attribute()) {
// [[gnu::...]] attributes are treated as declaration attributes, so may
// not appertain to a DeclaratorChunk. If we handle them as type
// attributes, accept them in that position and diagnose the GCC
// incompatibility.
if (attr.isGNUScope()) {
bool IsTypeAttr = attr.isTypeAttr();
if (TAL == TAL_DeclChunk) {
state.getSema().Diag(attr.getLoc(),
IsTypeAttr
? diag::warn_gcc_ignores_type_attr
: diag::warn_cxx11_gnu_attribute_on_type)
<< attr.getName();
if (!IsTypeAttr)
continue;
}
} else if (TAL != TAL_DeclChunk &&
attr.getKind() != ParsedAttr::AT_AddressSpace) {
// Otherwise, only consider type processing for a C++11 attribute if
// it's actually been applied to a type.
// We also allow C++11 address_space attributes to pass through.
continue;
}
}
// If this is an attribute we can handle, do so now;
// otherwise, add it to the FnAttrs list for rechaining.
switch (attr.getKind()) {
default:
// A C++11 attribute on a declarator chunk must appertain to a type.
if (attr.isCXX11Attribute() && TAL == TAL_DeclChunk) {
state.getSema().Diag(attr.getLoc(), diag::err_attribute_not_type_attr)
<< attr;
attr.setUsedAsTypeAttr();
}
break;
case ParsedAttr::UnknownAttribute:
if (attr.isCXX11Attribute() && TAL == TAL_DeclChunk)
state.getSema().Diag(attr.getLoc(),
diag::warn_unknown_attribute_ignored)
<< attr.getName();
break;
case ParsedAttr::IgnoredAttribute:
break;
case ParsedAttr::AT_MayAlias:
// FIXME: This attribute needs to actually be handled, but ignoring it
// breaks large amounts of Linux software.
attr.setUsedAsTypeAttr();
break;
case ParsedAttr::AT_OpenCLPrivateAddressSpace:
case ParsedAttr::AT_OpenCLGlobalAddressSpace:
case ParsedAttr::AT_OpenCLLocalAddressSpace:
case ParsedAttr::AT_OpenCLConstantAddressSpace:
case ParsedAttr::AT_OpenCLGenericAddressSpace:
case ParsedAttr::AT_AddressSpace:
HandleAddressSpaceTypeAttribute(type, attr, state);
attr.setUsedAsTypeAttr();
break;
OBJC_POINTER_TYPE_ATTRS_CASELIST:
if (!handleObjCPointerTypeAttr(state, attr, type))
distributeObjCPointerTypeAttr(state, attr, type);
attr.setUsedAsTypeAttr();
break;
case ParsedAttr::AT_VectorSize:
HandleVectorSizeAttr(type, attr, state.getSema());
attr.setUsedAsTypeAttr();
break;
case ParsedAttr::AT_ExtVectorType:
HandleExtVectorTypeAttr(type, attr, state.getSema());
attr.setUsedAsTypeAttr();
break;
case ParsedAttr::AT_NeonVectorType:
HandleNeonVectorTypeAttr(type, attr, state.getSema(),
VectorType::NeonVector);
attr.setUsedAsTypeAttr();
break;
case ParsedAttr::AT_NeonPolyVectorType:
HandleNeonVectorTypeAttr(type, attr, state.getSema(),
VectorType::NeonPolyVector);
attr.setUsedAsTypeAttr();
break;
case ParsedAttr::AT_OpenCLAccess:
HandleOpenCLAccessAttr(type, attr, state.getSema());
attr.setUsedAsTypeAttr();
break;
case ParsedAttr::AT_LifetimeBound:
if (TAL == TAL_DeclChunk)
HandleLifetimeBoundAttr(state, type, attr);
break;
case ParsedAttr::AT_NoDeref: {
ASTContext &Ctx = state.getSema().Context;
type = state.getAttributedType(createSimpleAttr<NoDerefAttr>(Ctx, attr),
type, type);
attr.setUsedAsTypeAttr();
state.setParsedNoDeref(true);
break;
}
MS_TYPE_ATTRS_CASELIST:
if (!handleMSPointerTypeQualifierAttr(state, attr, type))
attr.setUsedAsTypeAttr();
break;
NULLABILITY_TYPE_ATTRS_CASELIST:
// Either add nullability here or try to distribute it. We
// don't want to distribute the nullability specifier past any
// dependent type, because that complicates the user model.
if (type->canHaveNullability() || type->isDependentType() ||
type->isArrayType() ||
!distributeNullabilityTypeAttr(state, type, attr)) {
unsigned endIndex;
if (TAL == TAL_DeclChunk)
endIndex = state.getCurrentChunkIndex();
else
endIndex = state.getDeclarator().getNumTypeObjects();
bool allowOnArrayType =
state.getDeclarator().isPrototypeContext() &&
!hasOuterPointerLikeChunk(state.getDeclarator(), endIndex);
if (checkNullabilityTypeSpecifier(
state,
type,
attr,
allowOnArrayType)) {
attr.setInvalid();
}
attr.setUsedAsTypeAttr();
}
break;
case ParsedAttr::AT_ObjCKindOf:
// '__kindof' must be part of the decl-specifiers.
switch (TAL) {
case TAL_DeclSpec:
break;
case TAL_DeclChunk:
case TAL_DeclName:
state.getSema().Diag(attr.getLoc(),
diag::err_objc_kindof_wrong_position)
<< FixItHint::CreateRemoval(attr.getLoc())
<< FixItHint::CreateInsertion(
state.getDeclarator().getDeclSpec().getBeginLoc(),
"__kindof ");
break;
}
// Apply it regardless.
if (checkObjCKindOfType(state, type, attr))
attr.setInvalid();
break;
case ParsedAttr::AT_NoThrow:
// Exception Specifications aren't generally supported in C mode throughout
// clang, so revert to attribute-based handling for C.
if (!state.getSema().getLangOpts().CPlusPlus)
break;
LLVM_FALLTHROUGH;
FUNCTION_TYPE_ATTRS_CASELIST:
attr.setUsedAsTypeAttr();
// Never process function type attributes as part of the
// declaration-specifiers.
if (TAL == TAL_DeclSpec)
distributeFunctionTypeAttrFromDeclSpec(state, attr, type);
// Otherwise, handle the possible delays.
else if (!handleFunctionTypeAttr(state, attr, type))
distributeFunctionTypeAttr(state, attr, type);
break;
}
// Handle attributes that are defined in a macro. We do not want this to be
// applied to ObjC builtin attributes.
if (isa<AttributedType>(type) && attr.hasMacroIdentifier() &&
!type.getQualifiers().hasObjCLifetime() &&
!type.getQualifiers().hasObjCGCAttr() &&
attr.getKind() != ParsedAttr::AT_ObjCGC &&
attr.getKind() != ParsedAttr::AT_ObjCOwnership) {
const IdentifierInfo *MacroII = attr.getMacroIdentifier();
type = state.getSema().Context.getMacroQualifiedType(type, MacroII);
state.setExpansionLocForMacroQualifiedType(
cast<MacroQualifiedType>(type.getTypePtr()),
attr.getMacroExpansionLoc());
}
}
if (!state.getSema().getLangOpts().OpenCL ||
type.getAddressSpace() != LangAS::Default)
return;
deduceOpenCLImplicitAddrSpace(state, type, TAL);
}
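As a rough illustration of the kinds of type attributes the loop above dispatches on (a sketch built from documented Clang/GCC extensions, not code from this diff):
// Hypothetical declarations exercising a few attribute kinds handled in
// processTypeAttrs; exact diagnostics are omitted.
typedef __attribute__((address_space(3))) int as3_int;  // AT_AddressSpace
int *_Nonnull q;                                        // nullability qualifier
int __attribute__((noderef)) *nd;                       // AT_NoDeref on pointee
typedef int v4si __attribute__((vector_size(16)));      // AT_VectorSize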
void Sema::completeExprArrayBound(Expr *E) {
if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E->IgnoreParens())) {
if (VarDecl *Var = dyn_cast<VarDecl>(DRE->getDecl())) {
if (isTemplateInstantiation(Var->getTemplateSpecializationKind())) {
auto *Def = Var->getDefinition();
if (!Def) {
SourceLocation PointOfInstantiation = E->getExprLoc();
InstantiateVariableDefinition(PointOfInstantiation, Var);
Def = Var->getDefinition();
// If we don't already have a point of instantiation, and we managed
// to instantiate a definition, this is the point of instantiation.
// Otherwise, we don't request an end-of-TU instantiation, so this is
// not a point of instantiation.
// FIXME: Is this really the right behavior?
if (Var->getPointOfInstantiation().isInvalid() && Def) {
assert(Var->getTemplateSpecializationKind() ==
TSK_ImplicitInstantiation &&
"explicit instantiation with no point of instantiation");
Var->setTemplateSpecializationKind(
Var->getTemplateSpecializationKind(), PointOfInstantiation);
}
}
// Update the type to the definition's type both here and within the
// expression.
if (Def) {
DRE->setDecl(Def);
QualType T = Def->getType();
DRE->setType(T);
// FIXME: Update the type on all intervening expressions.
E->setType(T);
}
// We still go on to try to complete the type independently, as it
// may also require instantiations or diagnostics if it remains
// incomplete.
}
}
}
}
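To see why this hook matters (an illustrative sketch, not from the diff): the bound of a static member array of a class template is only known once the member's definition is instantiated, which is what completeExprArrayBound arranges for expressions like the sizeof below.
// Hypothetical example: sizeof needs the completed array type, which requires
// instantiating the definition of S<int>::table.
template <typename T> struct S { static const int table[]; };
template <typename T> const int S<T>::table[] = {1, 2, 3};
constexpr unsigned n = sizeof(S<int>::table) / sizeof(int); // n == 3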
/// Ensure that the type of the given expression is complete.
///
/// This routine checks whether the expression \p E has a complete type. If the
/// expression refers to an instantiable construct, that instantiation is
/// performed as needed to complete its type. Furthermore,
/// Sema::RequireCompleteType is called for the expression's type (or in the
/// case of a reference type, the referred-to type).
///
/// \param E The expression whose type is required to be complete.
/// \param Diagnoser The object that will emit a diagnostic if the type is
/// incomplete.
///
/// \returns \c true if the type of \p E is incomplete and diagnosed, \c false
/// otherwise.
bool Sema::RequireCompleteExprType(Expr *E, TypeDiagnoser &Diagnoser) {
QualType T = E->getType();
// Incomplete array types may be completed by the initializer attached to
// their definitions. For static data members of class templates and for
// variable templates, we need to instantiate the definition to get this
// initializer and complete the type.
if (T->isIncompleteArrayType()) {
completeExprArrayBound(E);
T = E->getType();
}
// FIXME: Are there other cases which require instantiating something other
// than the type to complete the type of an expression?
return RequireCompleteType(E->getExprLoc(), T, Diagnoser);
}
bool Sema::RequireCompleteExprType(Expr *E, unsigned DiagID) {
BoundTypeDiagnoser<> Diagnoser(DiagID);
return RequireCompleteExprType(E, Diagnoser);
}
/// Ensure that the type T is a complete type.
///
/// This routine checks whether the type @p T is complete in any
/// context where a complete type is required. If @p T is a complete
/// type, returns false. If @p T is a class template specialization,
/// this routine then attempts to perform class template
/// instantiation. If instantiation fails, or if @p T is incomplete
/// and cannot be completed, issues the diagnostic @p diag (giving it
/// the type @p T) and returns true.
///
/// @param Loc The location in the source that the incomplete type
/// diagnostic should refer to.
///
/// @param T The type that this routine is examining for completeness.
///
/// @returns @c true if @p T is incomplete and a diagnostic was emitted,
/// @c false otherwise.
bool Sema::RequireCompleteType(SourceLocation Loc, QualType T,
TypeDiagnoser &Diagnoser) {
if (RequireCompleteTypeImpl(Loc, T, &Diagnoser))
return true;
if (const TagType *Tag = T->getAs<TagType>()) {
if (!Tag->getDecl()->isCompleteDefinitionRequired()) {
Tag->getDecl()->setCompleteDefinitionRequired();
Consumer.HandleTagDeclRequiredDefinition(Tag->getDecl());
}
}
return false;
}
bool Sema::hasStructuralCompatLayout(Decl *D, Decl *Suggested) {
llvm::DenseSet<std::pair<Decl *, Decl *>> NonEquivalentDecls;
if (!Suggested)
return false;
// FIXME: Add a specific mode for C11 6.2.7/1 in StructuralEquivalenceContext
// and isolate from other C++ specific checks.
StructuralEquivalenceContext Ctx(
D->getASTContext(), Suggested->getASTContext(), NonEquivalentDecls,
StructuralEquivalenceKind::Default,
false /*StrictTypeSpelling*/, true /*Complain*/,
true /*ErrorOnTagTypeMismatch*/);
return Ctx.IsEquivalent(D, Suggested);
}
/// Determine whether there is any declaration of \p D that was ever a
/// definition (perhaps before module merging) and is currently visible.
/// \param D The definition of the entity.
/// \param Suggested Filled in with the declaration that should be made visible
/// in order to provide a definition of this entity.
/// \param OnlyNeedComplete If \c true, we only need the type to be complete,
/// not defined. This only matters for enums with a fixed underlying
/// type, since in all other cases, a type is complete if and only if it
/// is defined.
bool Sema::hasVisibleDefinition(NamedDecl *D, NamedDecl **Suggested,
bool OnlyNeedComplete) {
// Easy case: if we don't have modules, all declarations are visible.
if (!getLangOpts().Modules && !getLangOpts().ModulesLocalVisibility)
return true;
// If this definition was instantiated from a template, map back to the
// pattern from which it was instantiated.
if (isa<TagDecl>(D) && cast<TagDecl>(D)->isBeingDefined()) {
// We're in the middle of defining it; this definition should be treated
// as visible.
return true;
} else if (auto *RD = dyn_cast<CXXRecordDecl>(D)) {
if (auto *Pattern = RD->getTemplateInstantiationPattern())
RD = Pattern;
D = RD->getDefinition();
} else if (auto *ED = dyn_cast<EnumDecl>(D)) {
if (auto *Pattern = ED->getTemplateInstantiationPattern())
ED = Pattern;
if (OnlyNeedComplete && ED->isFixed()) {
// If the enum has a fixed underlying type, and we're only looking for a
// complete type (not a definition), any visible declaration of it will
// do.
*Suggested = nullptr;
for (auto *Redecl : ED->redecls()) {
if (isVisible(Redecl))
return true;
if (Redecl->isThisDeclarationADefinition() ||
(Redecl->isCanonicalDecl() && !*Suggested))
*Suggested = Redecl;
}
return false;
}
D = ED->getDefinition();
} else if (auto *FD = dyn_cast<FunctionDecl>(D)) {
if (auto *Pattern = FD->getTemplateInstantiationPattern())
FD = Pattern;
D = FD->getDefinition();
} else if (auto *VD = dyn_cast<VarDecl>(D)) {
if (auto *Pattern = VD->getTemplateInstantiationPattern())
VD = Pattern;
D = VD->getDefinition();
}
assert(D && "missing definition for pattern of instantiated definition");
*Suggested = D;
auto DefinitionIsVisible = [&] {
// The (primary) definition might be in a visible module.
if (isVisible(D))
return true;
// A visible module might have a merged definition instead.
if (D->isModulePrivate() ? hasMergedDefinitionInCurrentModule(D)
: hasVisibleMergedDefinition(D)) {
if (CodeSynthesisContexts.empty() &&
!getLangOpts().ModulesLocalVisibility) {
// Cache the fact that this definition is implicitly visible because
// there is a visible merged definition.
D->setVisibleDespiteOwningModule();
}
return true;
}
return false;
};
if (DefinitionIsVisible())
return true;
// The external source may have additional definitions of this entity that are
// visible, so complete the redeclaration chain now and ask again.
if (auto *Source = Context.getExternalSource()) {
Source->CompleteRedeclChain(D);
return DefinitionIsVisible();
}
return false;
}
/// Locks in the inheritance model for the given class and all of its bases.
static void assignInheritanceModel(Sema &S, CXXRecordDecl *RD) {
RD = RD->getMostRecentNonInjectedDecl();
if (!RD->hasAttr<MSInheritanceAttr>()) {
MSInheritanceAttr::Spelling IM;
switch (S.MSPointerToMemberRepresentationMethod) {
case LangOptions::PPTMK_BestCase:
IM = RD->calculateInheritanceModel();
break;
case LangOptions::PPTMK_FullGeneralitySingleInheritance:
IM = MSInheritanceAttr::Keyword_single_inheritance;
break;
case LangOptions::PPTMK_FullGeneralityMultipleInheritance:
IM = MSInheritanceAttr::Keyword_multiple_inheritance;
break;
case LangOptions::PPTMK_FullGeneralityVirtualInheritance:
IM = MSInheritanceAttr::Keyword_unspecified_inheritance;
break;
}
RD->addAttr(MSInheritanceAttr::CreateImplicit(
S.getASTContext(), IM,
/*BestCase=*/S.MSPointerToMemberRepresentationMethod ==
LangOptions::PPTMK_BestCase,
S.ImplicitMSInheritanceAttrLoc.isValid()
? S.ImplicitMSInheritanceAttrLoc
: RD->getSourceRange()));
S.Consumer.AssignInheritanceModel(RD);
}
}
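For background (an illustrative sketch that assumes Clang's Microsoft extensions, e.g. clang-cl or -fms-extensions; not part of this diff): under the Microsoft ABI the representation of a pointer-to-member depends on the class's inheritance model, which is what assignInheritanceModel locks in.
// Hypothetical sketch: declaring the inheritance model up front keeps
// pointer-to-member layout stable even while the class is incomplete.
struct __single_inheritance Fwd;  // model fixed before Fwd is defined
int Fwd::*pm = nullptr;           // laid out using the single-inheritance model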
/// The implementation of RequireCompleteType
bool Sema::RequireCompleteTypeImpl(SourceLocation Loc, QualType T,
TypeDiagnoser *Diagnoser) {
// FIXME: Add this assertion to make sure we always get instantiation points.
// assert(!Loc.isInvalid() && "Invalid location in RequireCompleteType");
// FIXME: Add this assertion to help us flush out problems with
// checking for dependent types and type-dependent expressions.
//
// assert(!T->isDependentType() &&
// "Can't ask whether a dependent type is complete");
if (const MemberPointerType *MPTy = T->getAs<MemberPointerType>()) {
if (!MPTy->getClass()->isDependentType()) {
if (getLangOpts().CompleteMemberPointers &&
!MPTy->getClass()->getAsCXXRecordDecl()->isBeingDefined() &&
RequireCompleteType(Loc, QualType(MPTy->getClass(), 0),
diag::err_memptr_incomplete))
return true;
// We lock in the inheritance model once somebody has asked us to ensure
// that a pointer-to-member type is complete.
if (Context.getTargetInfo().getCXXABI().isMicrosoft()) {
(void)isCompleteType(Loc, QualType(MPTy->getClass(), 0));
assignInheritanceModel(*this, MPTy->getMostRecentCXXRecordDecl());
}
}
}
NamedDecl *Def = nullptr;
bool Incomplete = T->isIncompleteType(&Def);
// Check that any necessary explicit specializations are visible. For an
// enum, we just need the declaration, so don't check this.
if (Def && !isa<EnumDecl>(Def))
checkSpecializationVisibility(Loc, Def);
// If we have a complete type, we're done.
if (!Incomplete) {
// If we know about the definition but it is not visible, complain.
NamedDecl *SuggestedDef = nullptr;
if (Def &&
!hasVisibleDefinition(Def, &SuggestedDef, /*OnlyNeedComplete*/true)) {
// If the user is going to see an error here, recover by making the
// definition visible.
bool TreatAsComplete = Diagnoser && !isSFINAEContext();
if (Diagnoser && SuggestedDef)
diagnoseMissingImport(Loc, SuggestedDef, MissingImportKind::Definition,
/*Recover*/TreatAsComplete);
return !TreatAsComplete;
} else if (Def && !TemplateInstCallbacks.empty()) {
CodeSynthesisContext TempInst;
TempInst.Kind = CodeSynthesisContext::Memoization;
TempInst.Template = Def;
TempInst.Entity = Def;
TempInst.PointOfInstantiation = Loc;
atTemplateBegin(TemplateInstCallbacks, *this, TempInst);
atTemplateEnd(TemplateInstCallbacks, *this, TempInst);
}
return false;
}
TagDecl *Tag = dyn_cast_or_null<TagDecl>(Def);
ObjCInterfaceDecl *IFace = dyn_cast_or_null<ObjCInterfaceDecl>(Def);
// Give the external source a chance to provide a definition of the type.
// This is kept separate from completing the redeclaration chain so that
// external sources such as LLDB can avoid synthesizing a type definition
// unless it's actually needed.
if (Tag || IFace) {
// Avoid diagnosing invalid decls as incomplete.
if (Def->isInvalidDecl())
return true;
// Give the external AST source a chance to complete the type.
if (auto *Source = Context.getExternalSource()) {
if (Tag && Tag->hasExternalLexicalStorage())
Source->CompleteType(Tag);
if (IFace && IFace->hasExternalLexicalStorage())
Source->CompleteType(IFace);
// If the external source completed the type, go through the motions
// again to ensure we're allowed to use the completed type.
if (!T->isIncompleteType())
return RequireCompleteTypeImpl(Loc, T, Diagnoser);
}
}
// If we have a class template specialization or a class member of a
// class template specialization, or an array with known size of such,
// try to instantiate it.
if (auto *RD = dyn_cast_or_null<CXXRecordDecl>(Tag)) {
bool Instantiated = false;
bool Diagnosed = false;
if (RD->isDependentContext()) {
// Don't try to instantiate a dependent class (e.g., a member template of
// an instantiated class template specialization).
// FIXME: Can this ever happen?
} else if (auto *ClassTemplateSpec =
dyn_cast<ClassTemplateSpecializationDecl>(RD)) {
if (ClassTemplateSpec->getSpecializationKind() == TSK_Undeclared) {
Diagnosed = InstantiateClassTemplateSpecialization(
Loc, ClassTemplateSpec, TSK_ImplicitInstantiation,
/*Complain=*/Diagnoser);
Instantiated = true;
}
} else {
CXXRecordDecl *Pattern = RD->getInstantiatedFromMemberClass();
if (!RD->isBeingDefined() && Pattern) {
MemberSpecializationInfo *MSI = RD->getMemberSpecializationInfo();
assert(MSI && "Missing member specialization information?");
// This record was instantiated from a class within a template.
if (MSI->getTemplateSpecializationKind() !=
TSK_ExplicitSpecialization) {
Diagnosed = InstantiateClass(Loc, RD, Pattern,
getTemplateInstantiationArgs(RD),
TSK_ImplicitInstantiation,
/*Complain=*/Diagnoser);
Instantiated = true;
}
}
}
if (Instantiated) {
// Instantiate* might have already complained that the template is not
// defined, if we asked it to.
if (Diagnoser && Diagnosed)
return true;
// If we instantiated a definition, check that it's usable, even if
// instantiation produced an error, so that repeated calls to this
// function give consistent answers.
if (!T->isIncompleteType())
return RequireCompleteTypeImpl(Loc, T, Diagnoser);
}
}
// FIXME: If we didn't instantiate a definition because of an explicit
// specialization declaration, check that it's visible.
if (!Diagnoser)
return true;
Diagnoser->diagnose(*this, Loc, T);
// If the type was a forward declaration of a class/struct/union
// type, produce a note.
if (Tag && !Tag->isInvalidDecl())
Diag(Tag->getLocation(),
Tag->isBeingDefined() ? diag::note_type_being_defined
: diag::note_forward_declaration)
<< Context.getTagDeclType(Tag);
// If the Objective-C class was a forward declaration, produce a note.
if (IFace && !IFace->isInvalidDecl())
Diag(IFace->getLocation(), diag::note_forward_class);
// If we have external information that we can use to suggest a fix,
// produce a note.
if (ExternalSource)
ExternalSource->MaybeDiagnoseMissingCompleteType(Loc, T);
return true;
}
bool Sema::RequireCompleteType(SourceLocation Loc, QualType T,
unsigned DiagID) {
BoundTypeDiagnoser<> Diagnoser(DiagID);
return RequireCompleteType(Loc, T, Diagnoser);
}
/// Get diagnostic %select index for tag kind for
/// literal type diagnostic message.
/// WARNING: Indexes apply to particular diagnostics only!
///
/// \returns diagnostic %select index.
static unsigned getLiteralDiagFromTagKind(TagTypeKind Tag) {
switch (Tag) {
case TTK_Struct: return 0;
case TTK_Interface: return 1;
case TTK_Class: return 2;
default: llvm_unreachable("Invalid tag kind for literal type diagnostic!");
}
}
/// Ensure that the type T is a literal type.
///
/// This routine checks whether the type @p T is a literal type. If @p T is an
/// incomplete type, an attempt is made to complete it. If @p T is a literal
/// type, or @p AllowIncompleteType is true and @p T is an incomplete type,
/// returns false. Otherwise, this routine issues the diagnostic @p PD (giving
/// it the type @p T), along with notes explaining why the type is not a
/// literal type, and returns true.
///
/// @param Loc The location in the source that the non-literal type
/// diagnostic should refer to.
///
/// @param T The type that this routine is examining for literalness.
///
/// @param Diagnoser Emits a diagnostic if T is not a literal type.
///
/// @returns @c true if @p T is not a literal type and a diagnostic was emitted,
/// @c false otherwise.
bool Sema::RequireLiteralType(SourceLocation Loc, QualType T,
TypeDiagnoser &Diagnoser) {
assert(!T->isDependentType() && "type should not be dependent");
QualType ElemType = Context.getBaseElementType(T);
if ((isCompleteType(Loc, ElemType) || ElemType->isVoidType()) &&
T->isLiteralType(Context))
return false;
Diagnoser.diagnose(*this, Loc, T);
if (T->isVariableArrayType())
return true;
const RecordType *RT = ElemType->getAs<RecordType>();
if (!RT)
return true;
const CXXRecordDecl *RD = cast<CXXRecordDecl>(RT->getDecl());
// A partially-defined class type can't be a literal type, because a literal
// class type must have a trivial destructor (which can't be checked until
// the class definition is complete).
if (RequireCompleteType(Loc, ElemType, diag::note_non_literal_incomplete, T))
return true;
// [expr.prim.lambda]p3:
// This class type is [not] a literal type.
if (RD->isLambda() && !getLangOpts().CPlusPlus17) {
Diag(RD->getLocation(), diag::note_non_literal_lambda);
return true;
}
// If the class has virtual base classes, then it's not an aggregate, and
// cannot have any constexpr constructors or a trivial default constructor,
// so is non-literal. This is better to diagnose than the resulting absence
// of constexpr constructors.
if (RD->getNumVBases()) {
Diag(RD->getLocation(), diag::note_non_literal_virtual_base)
<< getLiteralDiagFromTagKind(RD->getTagKind()) << RD->getNumVBases();
for (const auto &I : RD->vbases())
Diag(I.getBeginLoc(), diag::note_constexpr_virtual_base_here)
<< I.getSourceRange();
} else if (!RD->isAggregate() && !RD->hasConstexprNonCopyMoveConstructor() &&
!RD->hasTrivialDefaultConstructor()) {
Diag(RD->getLocation(), diag::note_non_literal_no_constexpr_ctors) << RD;
} else if (RD->hasNonLiteralTypeFieldsOrBases()) {
for (const auto &I : RD->bases()) {
if (!I.getType()->isLiteralType(Context)) {
Diag(I.getBeginLoc(), diag::note_non_literal_base_class)
<< RD << I.getType() << I.getSourceRange();
return true;
}
}
for (const auto *I : RD->fields()) {
if (!I->getType()->isLiteralType(Context) ||
I->getType().isVolatileQualified()) {
Diag(I->getLocation(), diag::note_non_literal_field)
<< RD << I << I->getType()
<< I->getType().isVolatileQualified();
return true;
}
}
} else if (!RD->hasTrivialDestructor()) {
// All fields and bases are of literal types, so have trivial destructors.
// If this class's destructor is non-trivial it must be user-declared.
CXXDestructorDecl *Dtor = RD->getDestructor();
assert(Dtor && "class has literal fields and bases but no dtor?");
if (!Dtor)
return true;
Diag(Dtor->getLocation(), Dtor->isUserProvided() ?
diag::note_non_literal_user_provided_dtor :
diag::note_non_literal_nontrivial_dtor) << RD;
if (!Dtor->isUserProvided())
SpecialMemberIsTrivial(Dtor, CXXDestructor, TAH_IgnoreTrivialABI,
/*Diagnose*/true);
}
return true;
}
bool Sema::RequireLiteralType(SourceLocation Loc, QualType T, unsigned DiagID) {
BoundTypeDiagnoser<> Diagnoser(DiagID);
return RequireLiteralType(Loc, T, Diagnoser);
}
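As a small illustration of what RequireLiteralType rejects (a sketch, not from the diff): a class with a user-provided destructor is not a literal type, so it cannot be the type of a constexpr variable; the offending line is left commented out so the snippet still compiles.
// Hypothetical example: Logger is non-literal because its destructor is
// user-provided and non-constexpr.
struct Logger {
  int level = 0;
  ~Logger() {}            // user-provided destructor
};
// constexpr Logger l{};  // error: constexpr variable must have literal type
constexpr int ok = 42;    // fine: int is a literal type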
/// Retrieve a version of the type 'T' that is elaborated by Keyword, qualified
/// by the nested-name-specifier contained in SS, and that is (re)declared by
/// OwnedTagDecl, which is nullptr if this is not a (re)declaration.
QualType Sema::getElaboratedType(ElaboratedTypeKeyword Keyword,
const CXXScopeSpec &SS, QualType T,
TagDecl *OwnedTagDecl) {
if (T.isNull())
return T;
NestedNameSpecifier *NNS;
if (SS.isValid())
NNS = SS.getScopeRep();
else {
if (Keyword == ETK_None)
return T;
NNS = nullptr;
}
return Context.getElaboratedType(Keyword, NNS, T, OwnedTagDecl);
}
QualType Sema::BuildTypeofExprType(Expr *E, SourceLocation Loc) {
assert(!E->hasPlaceholderType() && "unexpected placeholder");
if (!getLangOpts().CPlusPlus && E->refersToBitField())
Diag(E->getExprLoc(), diag::err_sizeof_alignof_typeof_bitfield) << 2;
if (!E->isTypeDependent()) {
QualType T = E->getType();
if (const TagType *TT = T->getAs<TagType>())
DiagnoseUseOfDecl(TT->getDecl(), E->getExprLoc());
}
return Context.getTypeOfExprType(E);
}
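A quick illustration (not from the diff) of the GNU __typeof__ extension this routine builds, including the bit-field restriction diagnosed above:
// Hypothetical __typeof__ usage; names are made up.
int x = 1;
__typeof__(x + 1) y = 2;        // y has type int
struct B { int bf : 3; } b;
// In C, __typeof__(b.bf) is rejected (err_sizeof_alignof_typeof_bitfield).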
/// getDecltypeForExpr - Given an expr, will return the decltype for
/// that expression, according to the rules in C++11
/// [dcl.type.simple]p4 and C++11 [expr.lambda.prim]p18.
static QualType getDecltypeForExpr(Sema &S, Expr *E) {
if (E->isTypeDependent())
return S.Context.DependentTy;
// C++11 [dcl.type.simple]p4:
// The type denoted by decltype(e) is defined as follows:
//
// - if e is an unparenthesized id-expression or an unparenthesized class
// member access (5.2.5), decltype(e) is the type of the entity named
// by e. If there is no such entity, or if e names a set of overloaded
// functions, the program is ill-formed;
//
// We apply the same rules for Objective-C ivar and property references.
if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E)) {
const ValueDecl *VD = DRE->getDecl();
return VD->getType();
} else if (const MemberExpr *ME = dyn_cast<MemberExpr>(E)) {
if (const ValueDecl *VD = ME->getMemberDecl())
if (isa<FieldDecl>(VD) || isa<VarDecl>(VD))
return VD->getType();
} else if (const ObjCIvarRefExpr *IR = dyn_cast<ObjCIvarRefExpr>(E)) {
return IR->getDecl()->getType();
} else if (const ObjCPropertyRefExpr *PR = dyn_cast<ObjCPropertyRefExpr>(E)) {
if (PR->isExplicitProperty())
return PR->getExplicitProperty()->getType();
} else if (auto *PE = dyn_cast<PredefinedExpr>(E)) {
return PE->getType();
}
// C++11 [expr.lambda.prim]p18:
// Every occurrence of decltype((x)) where x is a possibly
// parenthesized id-expression that names an entity of automatic
// storage duration is treated as if x were transformed into an
// access to a corresponding data member of the closure type that
// would have been declared if x were an odr-use of the denoted
// entity.
using namespace sema;
if (S.getCurLambda()) {
if (isa<ParenExpr>(E)) {
if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E->IgnoreParens())) {
if (VarDecl *Var = dyn_cast<VarDecl>(DRE->getDecl())) {
QualType T = S.getCapturedDeclRefType(Var, DRE->getLocation());
if (!T.isNull())
return S.Context.getLValueReferenceType(T);
}
}
}
}
// C++11 [dcl.type.simple]p4:
// [...]
QualType T = E->getType();
switch (E->getValueKind()) {
// - otherwise, if e is an xvalue, decltype(e) is T&&, where T is the
// type of e;
case VK_XValue: T = S.Context.getRValueReferenceType(T); break;
// - otherwise, if e is an lvalue, decltype(e) is T&, where T is the
// type of e;
case VK_LValue: T = S.Context.getLValueReferenceType(T); break;
// - otherwise, decltype(e) is the type of e.
case VK_RValue: break;
}
return T;
}
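The value-category rules spelled out in the comments above are easiest to see in a short example (illustrative, not from the diff):
#include <utility>  // for std::move
// Hypothetical example of the C++11 [dcl.type.simple]p4 rules.
int x = 0;
decltype(x) a = x;             // unparenthesized id-expression -> int
decltype((x)) b = x;           // parenthesized lvalue           -> int&
decltype(std::move(x)) c = 1;  // xvalue                         -> int&&
decltype(x + 1) d = 2;         // prvalue                        -> int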
QualType Sema::BuildDecltypeType(Expr *E, SourceLocation Loc,
bool AsUnevaluated) {
assert(!E->hasPlaceholderType() && "unexpected placeholder");
if (AsUnevaluated && CodeSynthesisContexts.empty() &&
E->HasSideEffects(Context, false)) {
// The expression operand for decltype is in an unevaluated expression
// context, so side effects could result in unintended consequences.
Diag(E->getExprLoc(), diag::warn_side_effects_unevaluated_context);
}
return Context.getDecltypeType(E, getDecltypeForExpr(*this, E));
}
QualType Sema::BuildUnaryTransformType(QualType BaseType,
UnaryTransformType::UTTKind UKind,
SourceLocation Loc) {
switch (UKind) {
case UnaryTransformType::EnumUnderlyingType:
if (!BaseType->isDependentType() && !BaseType->isEnumeralType()) {
Diag(Loc, diag::err_only_enums_have_underlying_types);
return QualType();
} else {
QualType Underlying = BaseType;
if (!BaseType->isDependentType()) {
// The enum could be incomplete if we're parsing its definition or
// recovering from an error.
NamedDecl *FwdDecl = nullptr;
if (BaseType->isIncompleteType(&FwdDecl)) {
Diag(Loc, diag::err_underlying_type_of_incomplete_enum) << BaseType;
Diag(FwdDecl->getLocation(), diag::note_forward_declaration) << FwdDecl;
return QualType();
}
EnumDecl *ED = BaseType->getAs<EnumType>()->getDecl();
assert(ED && "EnumType has no EnumDecl");
DiagnoseUseOfDecl(ED, Loc);
Underlying = ED->getIntegerType();
assert(!Underlying.isNull());
}
return Context.getUnaryTransformType(BaseType, Underlying,
UnaryTransformType::EnumUnderlyingType);
}
}
llvm_unreachable("unknown unary transform type");
}
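For reference (an illustrative sketch, not from the diff): the EnumUnderlyingType transform above is what backs the __underlying_type intrinsic used by std::underlying_type.
// Hypothetical example of the EnumUnderlyingType unary transform.
enum class Color : unsigned char { Red, Green };
typedef __underlying_type(Color) ColorRep;  // unsigned char
static_assert(sizeof(ColorRep) == sizeof(unsigned char), "fixed underlying type");
// Applying __underlying_type to a non-enum type is diagnosed
// (err_only_enums_have_underlying_types).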
QualType Sema::BuildAtomicType(QualType T, SourceLocation Loc) {
if (!T->isDependentType()) {
// FIXME: It isn't entirely clear whether incomplete atomic types
// are allowed or not; for simplicity, ban them for the moment.
if (RequireCompleteType(Loc, T, diag::err_atomic_specifier_bad_type, 0))
return QualType();
int DisallowedKind = -1;
if (T->isArrayType())
DisallowedKind = 1;
else if (T->isFunctionType())
DisallowedKind = 2;
else if (T->isReferenceType())
DisallowedKind = 3;
else if (T->isAtomicType())
DisallowedKind = 4;
else if (T.hasQualifiers())
DisallowedKind = 5;
else if (!T.isTriviallyCopyableType(Context))
// Some other non-trivially-copyable type (probably a C++ class)
DisallowedKind = 6;
if (DisallowedKind != -1) {
Diag(Loc, diag::err_atomic_specifier_bad_type) << DisallowedKind << T;
return QualType();
}
// FIXME: Do we need any handling for ARC here?
}
// Build the atomic type.
return Context.getAtomicType(T);
}
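To make the disallowed-kind checks above concrete (a sketch assuming Clang's C11 _Atomic support, which Clang also accepts in C++ as an extension; the invalid forms are left commented out):
// Hypothetical examples for BuildAtomicType's checks; the numbers refer to the
// DisallowedKind values selected above.
_Atomic(int) counter;     // fine: unqualified, trivially copyable scalar
// _Atomic(int[4]) a;     // rejected: array type     (kind 1)
// _Atomic(void(void)) f; // rejected: function type  (kind 2)
// _Atomic(const int) c;  // rejected: qualified type (kind 5)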
Index: projects/clang900-import/contrib/llvm/tools/clang
===================================================================
--- projects/clang900-import/contrib/llvm/tools/clang (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/clang (revision 351722)
Property changes on: projects/clang900-import/contrib/llvm/tools/clang
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/clang/dist-release_90:r351684-351721
Index: projects/clang900-import/contrib/llvm/tools/lld/CMakeLists.txt
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/CMakeLists.txt (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/CMakeLists.txt (revision 351722)
@@ -1,226 +1,225 @@
# Check if lld is built as a standalone project.
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
project(lld)
cmake_minimum_required(VERSION 3.4.3)
set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(LLD_BUILT_STANDALONE TRUE)
find_program(LLVM_CONFIG_PATH "llvm-config" DOC "Path to llvm-config binary")
if(NOT LLVM_CONFIG_PATH)
message(FATAL_ERROR "llvm-config not found: specify LLVM_CONFIG_PATH")
endif()
execute_process(COMMAND "${LLVM_CONFIG_PATH}"
"--obj-root"
"--includedir"
"--cmakedir"
"--src-root"
RESULT_VARIABLE HAD_ERROR
OUTPUT_VARIABLE LLVM_CONFIG_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(HAD_ERROR)
message(FATAL_ERROR "llvm-config failed with status ${HAD_ERROR}")
endif()
string(REGEX REPLACE "[ \t]*[\r\n]+[ \t]*" ";" LLVM_CONFIG_OUTPUT "${LLVM_CONFIG_OUTPUT}")
list(GET LLVM_CONFIG_OUTPUT 0 OBJ_ROOT)
list(GET LLVM_CONFIG_OUTPUT 1 MAIN_INCLUDE_DIR)
list(GET LLVM_CONFIG_OUTPUT 2 LLVM_CMAKE_PATH)
list(GET LLVM_CONFIG_OUTPUT 3 MAIN_SRC_DIR)
set(LLVM_OBJ_ROOT ${OBJ_ROOT} CACHE PATH "path to LLVM build tree")
set(LLVM_MAIN_INCLUDE_DIR ${MAIN_INCLUDE_DIR} CACHE PATH "path to llvm/include")
set(LLVM_MAIN_SRC_DIR ${MAIN_SRC_DIR} CACHE PATH "Path to LLVM source tree")
file(TO_CMAKE_PATH ${LLVM_OBJ_ROOT} LLVM_BINARY_DIR)
if(NOT EXISTS "${LLVM_CMAKE_PATH}/LLVMConfig.cmake")
message(FATAL_ERROR "LLVMConfig.cmake not found")
endif()
include("${LLVM_CMAKE_PATH}/LLVMConfig.cmake")
list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_PATH}")
set(PACKAGE_VERSION "${LLVM_PACKAGE_VERSION}")
include_directories("${LLVM_BINARY_DIR}/include" ${LLVM_INCLUDE_DIRS})
link_directories(${LLVM_LIBRARY_DIRS})
set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
include(AddLLVM)
include(TableGen)
include(HandleLLVMOptions)
if(LLVM_INCLUDE_TESTS)
- set(Python_ADDITIONAL_VERSIONS 2.7)
include(FindPythonInterp)
if(NOT PYTHONINTERP_FOUND)
message(FATAL_ERROR
"Unable to find Python interpreter, required for testing.
Please install Python or specify the PYTHON_EXECUTABLE CMake variable.")
endif()
if(${PYTHON_VERSION_STRING} VERSION_LESS 2.7)
message(FATAL_ERROR "Python 2.7 or newer is required")
endif()
# Check prebuilt llvm/utils.
if(EXISTS ${LLVM_TOOLS_BINARY_DIR}/FileCheck${CMAKE_EXECUTABLE_SUFFIX}
AND EXISTS ${LLVM_TOOLS_BINARY_DIR}/not${CMAKE_EXECUTABLE_SUFFIX})
set(LLVM_UTILS_PROVIDED ON)
endif()
if(EXISTS ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py)
# Note: path not really used, except for checking if lit was found
set(LLVM_LIT ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py)
if(NOT LLVM_UTILS_PROVIDED)
add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/FileCheck utils/FileCheck)
add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/not utils/not)
set(LLVM_UTILS_PROVIDED ON)
set(LLD_TEST_DEPS FileCheck not)
endif()
set(UNITTEST_DIR ${LLVM_MAIN_SRC_DIR}/utils/unittest)
if(EXISTS ${UNITTEST_DIR}/googletest/include/gtest/gtest.h
AND NOT EXISTS ${LLVM_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}
AND EXISTS ${UNITTEST_DIR}/CMakeLists.txt)
add_subdirectory(${UNITTEST_DIR} utils/unittest)
endif()
else()
# Seek installed Lit.
find_program(LLVM_LIT
NAMES llvm-lit lit.py lit
PATHS "${LLVM_MAIN_SRC_DIR}/utils/lit"
DOC "Path to lit.py")
endif()
if(LLVM_LIT)
# Define the default arguments to use with 'lit', and an option for the user
# to override.
set(LIT_ARGS_DEFAULT "-sv")
if (MSVC OR XCODE)
set(LIT_ARGS_DEFAULT "${LIT_ARGS_DEFAULT} --no-progress-bar")
endif()
set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit")
# On Win32 hosts, provide an option to specify the path to the GnuWin32 tools.
if(WIN32 AND NOT CYGWIN)
set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools")
endif()
else()
set(LLVM_INCLUDE_TESTS OFF)
endif()
endif()
endif()
set(LLD_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(LLD_INCLUDE_DIR ${LLD_SOURCE_DIR}/include )
set(LLD_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
# Compute the LLD version from the LLVM version.
string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" LLD_VERSION
${PACKAGE_VERSION})
message(STATUS "LLD version: ${LLD_VERSION}")
string(REGEX REPLACE "([0-9]+)\\.[0-9]+(\\.[0-9]+)?" "\\1" LLD_VERSION_MAJOR
${LLD_VERSION})
string(REGEX REPLACE "[0-9]+\\.([0-9]+)(\\.[0-9]+)?" "\\1" LLD_VERSION_MINOR
${LLD_VERSION})
# Determine LLD revision and repository.
# TODO: Figure out a way to get the revision and the repository on windows.
if ( NOT CMAKE_SYSTEM_NAME MATCHES "Windows" )
execute_process(COMMAND ${CMAKE_SOURCE_DIR}/utils/GetSourceVersion ${LLD_SOURCE_DIR}
OUTPUT_VARIABLE LLD_REVISION)
execute_process(COMMAND ${CMAKE_SOURCE_DIR}/utils/GetRepositoryPath ${LLD_SOURCE_DIR}
OUTPUT_VARIABLE LLD_REPOSITORY)
if ( LLD_REPOSITORY )
# Replace newline characters with spaces
string(REGEX REPLACE "(\r?\n)+" " " LLD_REPOSITORY ${LLD_REPOSITORY})
# Remove leading spaces
STRING(REGEX REPLACE "^[ \t\r\n]+" "" LLD_REPOSITORY "${LLD_REPOSITORY}" )
# Remove trailing spaces
string(REGEX REPLACE "(\ )+$" "" LLD_REPOSITORY ${LLD_REPOSITORY})
endif()
if ( LLD_REVISION )
# Replace newline characters with spaces
string(REGEX REPLACE "(\r?\n)+" " " LLD_REVISION ${LLD_REVISION})
# Remove leading spaces
STRING(REGEX REPLACE "^[ \t\r\n]+" "" LLD_REVISION "${LLD_REVISION}" )
# Remove trailing spaces
string(REGEX REPLACE "(\ )+$" "" LLD_REVISION ${LLD_REVISION})
endif()
endif ()
# Configure the Version.inc file.
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/include/lld/Common/Version.inc.in
${CMAKE_CURRENT_BINARY_DIR}/include/lld/Common/Version.inc)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
message(FATAL_ERROR "In-source builds are not allowed. CMake would overwrite "
"the makefiles distributed with LLVM. Please create a directory and run cmake "
"from there, passing the path to this source directory as the last argument. "
"This process created the file `CMakeCache.txt' and the directory "
"`CMakeFiles'. Please delete them.")
endif()
list (APPEND CMAKE_MODULE_PATH "${LLD_SOURCE_DIR}/cmake/modules")
include(AddLLD)
option(LLD_USE_VTUNE
"Enable VTune user task tracking."
OFF)
if (LLD_USE_VTUNE)
find_package(VTune)
if (VTUNE_FOUND)
include_directories(${VTune_INCLUDE_DIRS})
list(APPEND LLVM_COMMON_LIBS ${VTune_LIBRARIES})
add_definitions(-DLLD_HAS_VTUNE)
endif()
endif()
option(LLD_BUILD_TOOLS
"Build the lld tools. If OFF, just generate build targets." ON)
if (MSVC)
add_definitions(-wd4530) # Suppress 'warning C4530: C++ exception handler used, but unwind semantics are not enabled.'
add_definitions(-wd4062) # Suppress 'warning C4062: enumerator X in switch of enum Y is not handled' from system header.
endif()
include_directories(BEFORE
${CMAKE_CURRENT_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/include
)
if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
install(DIRECTORY include/
DESTINATION include
FILES_MATCHING
PATTERN "*.h"
PATTERN ".svn" EXCLUDE
)
endif()
add_subdirectory(Common)
add_subdirectory(lib)
add_subdirectory(tools/lld)
if (LLVM_INCLUDE_TESTS)
add_subdirectory(test)
add_subdirectory(unittests)
endif()
add_subdirectory(docs)
add_subdirectory(COFF)
add_subdirectory(ELF)
add_subdirectory(MinGW)
add_subdirectory(wasm)
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/Config.h
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/Config.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/Config.h (revision 351722)
@@ -1,231 +1,232 @@
//===- Config.h -------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLD_COFF_CONFIG_H
#define LLD_COFF_CONFIG_H
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/CachePruning.h"
#include <cstdint>
#include <map>
#include <set>
#include <string>
namespace lld {
namespace coff {
using llvm::COFF::IMAGE_FILE_MACHINE_UNKNOWN;
using llvm::COFF::WindowsSubsystem;
using llvm::StringRef;
class DefinedAbsolute;
class DefinedRelative;
class StringChunk;
class Symbol;
class InputFile;
// Short aliases.
static const auto AMD64 = llvm::COFF::IMAGE_FILE_MACHINE_AMD64;
static const auto ARM64 = llvm::COFF::IMAGE_FILE_MACHINE_ARM64;
static const auto ARMNT = llvm::COFF::IMAGE_FILE_MACHINE_ARMNT;
static const auto I386 = llvm::COFF::IMAGE_FILE_MACHINE_I386;
// Represents an /export option.
struct Export {
StringRef name; // N in /export:N or /export:E=N
StringRef extName; // E in /export:E=N
Symbol *sym = nullptr;
uint16_t ordinal = 0;
bool noname = false;
bool data = false;
bool isPrivate = false;
bool constant = false;
// If an export is of the form /export:foo=dllname.bar, that means
// that foo should be exported as an alias to bar in the DLL.
// forwardTo is set to the "dllname.bar" part. Usually empty.
StringRef forwardTo;
StringChunk *forwardChunk = nullptr;
// True if this /export option was in a .drectve section.
bool directives = false;
StringRef symbolName;
StringRef exportName; // Name in DLL
bool operator==(const Export &e) {
return (name == e.name && extName == e.extName &&
ordinal == e.ordinal && noname == e.noname &&
data == e.data && isPrivate == e.isPrivate);
}
};
enum class DebugType {
None = 0x0,
CV = 0x1, /// CodeView
PData = 0x2, /// Procedure Data
Fixup = 0x4, /// Relocation Table
};
enum class GuardCFLevel {
Off,
NoLongJmp, // Emit gfids but no longjmp tables
Full, // Enable all protections.
};
// Global configuration.
struct Configuration {
enum ManifestKind { SideBySide, Embed, No };
bool is64() { return machine == AMD64 || machine == ARM64; }
llvm::COFF::MachineTypes machine = IMAGE_FILE_MACHINE_UNKNOWN;
size_t wordsize;
bool verbose = false;
WindowsSubsystem subsystem = llvm::COFF::IMAGE_SUBSYSTEM_UNKNOWN;
Symbol *entry = nullptr;
bool noEntry = false;
std::string outputFile;
std::string importName;
bool demangle = true;
bool doGC = true;
bool doICF = true;
bool tailMerge;
bool relocatable = true;
bool forceMultiple = false;
bool forceMultipleRes = false;
bool forceUnresolved = false;
bool debug = false;
bool debugDwarf = false;
bool debugGHashes = false;
bool debugSymtab = false;
bool showTiming = false;
bool showSummary = false;
unsigned debugTypes = static_cast<unsigned>(DebugType::None);
std::vector<std::string> natvisFiles;
llvm::SmallString<128> pdbAltPath;
llvm::SmallString<128> pdbPath;
llvm::SmallString<128> pdbSourcePath;
std::vector<llvm::StringRef> argv;
// Symbols in this set are considered live by the garbage collector.
std::vector<Symbol *> gcroot;
std::set<std::string> noDefaultLibs;
bool noDefaultLibAll = false;
// True if we are creating a DLL.
bool dll = false;
StringRef implib;
std::vector<Export> exports;
std::set<std::string> delayLoads;
std::map<std::string, int> dllOrder;
Symbol *delayLoadHelper = nullptr;
bool saveTemps = false;
// /guard:cf
GuardCFLevel guardCF = GuardCFLevel::Off;
// Used for SafeSEH.
bool safeSEH = false;
Symbol *sehTable = nullptr;
Symbol *sehCount = nullptr;
// Used for /opt:lldlto=N
unsigned ltoo = 2;
// Used for /opt:lldltojobs=N
unsigned thinLTOJobs = 0;
// Used for /opt:lldltopartitions=N
unsigned ltoPartitions = 1;
// Used for /opt:lldltocache=path
StringRef ltoCache;
// Used for /opt:lldltocachepolicy=policy
llvm::CachePruningPolicy ltoCachePolicy;
// Used for /merge:from=to (e.g. /merge:.rdata=.text)
std::map<StringRef, StringRef> merge;
// Used for /section=.name,{DEKPRSW} to set section attributes.
std::map<StringRef, uint32_t> section;
// Options for manifest files.
ManifestKind manifest = No;
int manifestID = 1;
StringRef manifestDependency;
bool manifestUAC = true;
std::vector<std::string> manifestInput;
StringRef manifestLevel = "'asInvoker'";
StringRef manifestUIAccess = "'false'";
StringRef manifestFile;
// Used for /aligncomm.
std::map<std::string, int> alignComm;
// Used for /failifmismatch.
std::map<StringRef, std::pair<StringRef, InputFile *>> mustMatch;
// Used for /alternatename.
std::map<StringRef, StringRef> alternateNames;
// Used for /order.
llvm::StringMap<int> order;
// Used for /lldmap.
std::string mapFile;
// Used for /thinlto-index-only:
llvm::StringRef thinLTOIndexOnlyArg;
// Used for /thinlto-object-prefix-replace:
std::pair<llvm::StringRef, llvm::StringRef> thinLTOPrefixReplace;
// Used for /thinlto-object-suffix-replace:
std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
+ uint64_t align = 4096;
uint64_t imageBase = -1;
uint64_t fileAlign = 512;
uint64_t stackReserve = 1024 * 1024;
uint64_t stackCommit = 4096;
uint64_t heapReserve = 1024 * 1024;
uint64_t heapCommit = 4096;
uint32_t majorImageVersion = 0;
uint32_t minorImageVersion = 0;
uint32_t majorOSVersion = 6;
uint32_t minorOSVersion = 0;
uint32_t timestamp = 0;
uint32_t functionPadMin = 0;
bool dynamicBase = true;
bool allowBind = true;
bool nxCompat = true;
bool allowIsolation = true;
bool terminalServerAware = true;
bool largeAddressAware = false;
bool highEntropyVA = false;
bool appContainer = false;
bool mingw = false;
bool warnMissingOrderSymbol = true;
bool warnLocallyDefinedImported = true;
bool warnDebugInfoUnusable = true;
bool incremental = true;
bool integrityCheck = false;
bool killAt = false;
bool repro = false;
bool swaprunCD = false;
bool swaprunNet = false;
bool thinLTOEmitImportsFiles;
bool thinLTOIndexOnly;
};
extern Configuration *config;
} // namespace coff
} // namespace lld
#endif
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/Driver.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/Driver.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/Driver.cpp (revision 351722)
@@ -1,1893 +1,1917 @@
//===- Driver.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Driver.h"
#include "Config.h"
#include "DebugTypes.h"
#include "ICF.h"
#include "InputFiles.h"
#include "MarkLive.h"
#include "MinGW.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "Writer.h"
#include "lld/Common/Args.h"
#include "lld/Common/Driver.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Filesystem.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Timer.h"
#include "lld/Common/Version.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/COFFModuleDefinition.h"
#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/TarWriter.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include <algorithm>
#include <future>
#include <memory>
using namespace llvm;
using namespace llvm::object;
using namespace llvm::COFF;
using llvm::sys::Process;
namespace lld {
namespace coff {
static Timer inputFileTimer("Input File Reading", Timer::root());
Configuration *config;
LinkerDriver *driver;
bool link(ArrayRef<const char *> args, bool canExitEarly, raw_ostream &diag) {
errorHandler().logName = args::getFilenameWithoutExe(args[0]);
errorHandler().errorOS = &diag;
errorHandler().colorDiagnostics = diag.has_colors();
errorHandler().errorLimitExceededMsg =
"too many errors emitted, stopping now"
" (use /errorlimit:0 to see all errors)";
errorHandler().exitEarly = canExitEarly;
config = make<Configuration>();
symtab = make<SymbolTable>();
driver = make<LinkerDriver>();
driver->link(args);
// Call exit() if we can to avoid calling destructors.
if (canExitEarly)
exitLld(errorCount() ? 1 : 0);
freeArena();
ObjFile::instances.clear();
ImportFile::instances.clear();
BitcodeFile::instances.clear();
memset(MergeChunk::instances, 0, sizeof(MergeChunk::instances));
return !errorCount();
}
// Parse options of the form "old;new".
static std::pair<StringRef, StringRef> getOldNewOptions(opt::InputArgList &args,
unsigned id) {
auto *arg = args.getLastArg(id);
if (!arg)
return {"", ""};
StringRef s = arg->getValue();
std::pair<StringRef, StringRef> ret = s.split(';');
if (ret.second.empty())
error(arg->getSpelling() + " expects 'old;new' format, but got " + s);
return ret;
}
// Drop directory components and replace extension with ".exe" or ".dll".
static std::string getOutputPath(StringRef path) {
auto p = path.find_last_of("\\/");
StringRef s = (p == StringRef::npos) ? path : path.substr(p + 1);
const char* e = config->dll ? ".dll" : ".exe";
return (s.substr(0, s.rfind('.')) + e).str();
}
// Returns true if S matches /crtend.?\.o$/.
static bool isCrtend(StringRef s) {
if (!s.endswith(".o"))
return false;
s = s.drop_back(2);
if (s.endswith("crtend"))
return true;
return !s.empty() && s.drop_back().endswith("crtend");
}
// ErrorOr is not default constructible, so it cannot be used as the type
// parameter of a future.
// FIXME: We could open the file in createFutureForFile and avoid needing to
// return an error here, but for the moment that would cost us a file descriptor
// (a limited resource on Windows) for the duration that the future is pending.
using MBErrPair = std::pair<std::unique_ptr<MemoryBuffer>, std::error_code>;
// Create a std::future that opens and maps a file using the best strategy for
// the host platform.
static std::future<MBErrPair> createFutureForFile(std::string path) {
#if _WIN32
// On Windows, file I/O is relatively slow so it is best to do this
// asynchronously.
auto strategy = std::launch::async;
#else
auto strategy = std::launch::deferred;
#endif
return std::async(strategy, [=]() {
auto mbOrErr = MemoryBuffer::getFile(path,
/*FileSize*/ -1,
/*RequiresNullTerminator*/ false);
if (!mbOrErr)
return MBErrPair{nullptr, mbOrErr.getError()};
return MBErrPair{std::move(*mbOrErr), std::error_code()};
});
}
// Symbol names are mangled by prepending "_" on x86.
static StringRef mangle(StringRef sym) {
assert(config->machine != IMAGE_FILE_MACHINE_UNKNOWN);
if (config->machine == I386)
return saver.save("_" + sym);
return sym;
}
static bool findUnderscoreMangle(StringRef sym) {
Symbol *s = symtab->findMangle(mangle(sym));
return s && !isa<Undefined>(s);
}
MemoryBufferRef LinkerDriver::takeBuffer(std::unique_ptr<MemoryBuffer> mb) {
MemoryBufferRef mbref = *mb;
make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take ownership
if (driver->tar)
driver->tar->append(relativeToRoot(mbref.getBufferIdentifier()),
mbref.getBuffer());
return mbref;
}
void LinkerDriver::addBuffer(std::unique_ptr<MemoryBuffer> mb,
bool wholeArchive) {
StringRef filename = mb->getBufferIdentifier();
MemoryBufferRef mbref = takeBuffer(std::move(mb));
filePaths.push_back(filename);
// File type is detected by contents, not by file extension.
switch (identify_magic(mbref.getBuffer())) {
case file_magic::windows_resource:
resources.push_back(mbref);
break;
case file_magic::archive:
if (wholeArchive) {
std::unique_ptr<Archive> file =
CHECK(Archive::create(mbref), filename + ": failed to parse archive");
Archive *archive = file.get();
make<std::unique_ptr<Archive>>(std::move(file)); // take ownership
for (MemoryBufferRef m : getArchiveMembers(archive))
addArchiveBuffer(m, "<whole-archive>", filename, 0);
return;
}
symtab->addFile(make<ArchiveFile>(mbref));
break;
case file_magic::bitcode:
symtab->addFile(make<BitcodeFile>(mbref, "", 0));
break;
case file_magic::coff_object:
case file_magic::coff_import_library:
symtab->addFile(make<ObjFile>(mbref));
break;
case file_magic::pdb:
loadTypeServerSource(mbref);
break;
case file_magic::coff_cl_gl_object:
error(filename + ": is not a native COFF file. Recompile without /GL");
break;
case file_magic::pecoff_executable:
if (filename.endswith_lower(".dll")) {
error(filename + ": bad file type. Did you specify a DLL instead of an "
"import library?");
break;
}
LLVM_FALLTHROUGH;
default:
error(mbref.getBufferIdentifier() + ": unknown file type");
break;
}
}
void LinkerDriver::enqueuePath(StringRef path, bool wholeArchive) {
auto future =
std::make_shared<std::future<MBErrPair>>(createFutureForFile(path));
std::string pathStr = path;
enqueueTask([=]() {
auto mbOrErr = future->get();
if (mbOrErr.second) {
std::string msg =
"could not open '" + pathStr + "': " + mbOrErr.second.message();
// Check if the filename is a typo for an option flag. OptTable thinks
// that all args that are not known options and that start with / are
// filenames, but e.g. `/nodefaultlibs` is more likely a typo for
// the option `/nodefaultlib` than a reference to a file in the root
// directory.
std::string nearest;
if (COFFOptTable().findNearest(pathStr, nearest) > 1)
error(msg);
else
error(msg + "; did you mean '" + nearest + "'");
} else
driver->addBuffer(std::move(mbOrErr.first), wholeArchive);
});
}
void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
StringRef parentName,
uint64_t offsetInArchive) {
file_magic magic = identify_magic(mb.getBuffer());
if (magic == file_magic::coff_import_library) {
InputFile *imp = make<ImportFile>(mb);
imp->parentName = parentName;
symtab->addFile(imp);
return;
}
InputFile *obj;
if (magic == file_magic::coff_object) {
obj = make<ObjFile>(mb);
} else if (magic == file_magic::bitcode) {
obj = make<BitcodeFile>(mb, parentName, offsetInArchive);
} else {
error("unknown file type: " + mb.getBufferIdentifier());
return;
}
obj->parentName = parentName;
symtab->addFile(obj);
log("Loaded " + toString(obj) + " for " + symName);
}
void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
- StringRef symName,
+ const Archive::Symbol &sym,
StringRef parentName) {
- auto reportBufferError = [=](Error &&e,
- StringRef childName) {
+ auto reportBufferError = [=](Error &&e, StringRef childName) {
fatal("could not get the buffer for the member defining symbol " +
- symName + ": " + parentName + "(" + childName + "): " +
+ toCOFFString(sym) + ": " + parentName + "(" + childName + "): " +
toString(std::move(e)));
};
if (!c.getParent()->isThin()) {
uint64_t offsetInArchive = c.getChildOffset();
Expected<MemoryBufferRef> mbOrErr = c.getMemoryBufferRef();
if (!mbOrErr)
reportBufferError(mbOrErr.takeError(), check(c.getFullName()));
MemoryBufferRef mb = mbOrErr.get();
enqueueTask([=]() {
- driver->addArchiveBuffer(mb, symName, parentName, offsetInArchive);
+ driver->addArchiveBuffer(mb, toCOFFString(sym), parentName,
+ offsetInArchive);
});
return;
}
std::string childName = CHECK(
c.getFullName(),
"could not get the filename for the member defining symbol " +
- symName);
+ toCOFFString(sym));
auto future = std::make_shared<std::future<MBErrPair>>(
createFutureForFile(childName));
enqueueTask([=]() {
auto mbOrErr = future->get();
if (mbOrErr.second)
reportBufferError(errorCodeToError(mbOrErr.second), childName);
- driver->addArchiveBuffer(takeBuffer(std::move(mbOrErr.first)), symName,
- parentName, /* OffsetInArchive */ 0);
+ driver->addArchiveBuffer(takeBuffer(std::move(mbOrErr.first)),
+ toCOFFString(sym), parentName,
+ /*OffsetInArchive=*/0);
});
}
static bool isDecorated(StringRef sym) {
return sym.startswith("@") || sym.contains("@@") || sym.startswith("?") ||
(!config->mingw && sym.contains('@'));
}
// Parses .drectve section contents and handles the embedded options,
// such as /defaultlib and /export.
void LinkerDriver::parseDirectives(InputFile *file) {
StringRef s = file->getDirectives();
if (s.empty())
return;
log("Directives: " + toString(file) + ": " + s);
ArgParser parser;
// .drectve is always tokenized using Windows shell rules.
// The /EXPORT: option can appear many times; it is processed in a fast path.
opt::InputArgList args;
std::vector<StringRef> exports;
std::tie(args, exports) = parser.parseDirectives(s);
for (StringRef e : exports) {
// If a common header file contains dllexported function
// declarations, many object files may end up with the
// same /EXPORT options. To save the cost of parsing them,
// we dedup them first.
if (!directivesExports.insert(e).second)
continue;
Export exp = parseExport(e);
if (config->machine == I386 && config->mingw) {
if (!isDecorated(exp.name))
exp.name = saver.save("_" + exp.name);
if (!exp.extName.empty() && !isDecorated(exp.extName))
exp.extName = saver.save("_" + exp.extName);
}
exp.directives = true;
config->exports.push_back(exp);
}
for (auto *arg : args) {
switch (arg->getOption().getID()) {
case OPT_aligncomm:
parseAligncomm(arg->getValue());
break;
case OPT_alternatename:
parseAlternateName(arg->getValue());
break;
case OPT_defaultlib:
if (Optional<StringRef> path = findLib(arg->getValue()))
enqueuePath(*path, false);
break;
case OPT_entry:
config->entry = addUndefined(mangle(arg->getValue()));
break;
case OPT_failifmismatch:
checkFailIfMismatch(arg->getValue(), file);
break;
case OPT_incl:
addUndefined(arg->getValue());
break;
case OPT_merge:
parseMerge(arg->getValue());
break;
case OPT_nodefaultlib:
config->noDefaultLibs.insert(doFindLib(arg->getValue()).lower());
break;
case OPT_section:
parseSection(arg->getValue());
break;
case OPT_subsystem:
parseSubsystem(arg->getValue(), &config->subsystem,
&config->majorOSVersion, &config->minorOSVersion);
break;
// Only add flags here that link.exe accepts in
// `#pragma comment(linker, "/flag")`-generated sections.
case OPT_editandcontinue:
case OPT_guardsym:
case OPT_throwingnew:
break;
default:
error(arg->getSpelling() + " is not allowed in .drectve");
}
}
}
// Find a file from the search paths. You can omit ".obj"; this function takes
// care of that. Note that the returned path is not guaranteed to exist.
StringRef LinkerDriver::doFindFile(StringRef filename) {
bool hasPathSep = (filename.find_first_of("/\\") != StringRef::npos);
if (hasPathSep)
return filename;
bool hasExt = filename.contains('.');
for (StringRef dir : searchPaths) {
SmallString<128> path = dir;
sys::path::append(path, filename);
if (sys::fs::exists(path.str()))
return saver.save(path.str());
if (!hasExt) {
path.append(".obj");
if (sys::fs::exists(path.str()))
return saver.save(path.str());
}
}
return filename;
}
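// Illustrative sketch (not part of this diff): the same lookup rule written
// against std::filesystem (C++17) so it can be compiled in isolation. A name
// containing a path separator is used as-is; a bare name is tried in each
// search directory, and if it has no extension, "<name>.obj" is tried too.
// demoFindFile and its parameters are hypothetical.
#include <filesystem>
#include <string>
#include <vector>

static std::string demoFindFile(const std::string &name,
                                const std::vector<std::string> &searchDirs) {
  namespace fs = std::filesystem;
  if (name.find_first_of("/\\") != std::string::npos)
    return name;                                  // explicit path: use as-is
  bool hasExt = name.find('.') != std::string::npos;
  for (const std::string &dir : searchDirs) {
    fs::path p = fs::path(dir) / name;
    if (fs::exists(p))
      return p.string();
    if (!hasExt) {
      fs::path withObj = p;
      withObj += ".obj";                          // "foo" -> "foo.obj"
      if (fs::exists(withObj))
        return withObj.string();
    }
  }
  return name;                                    // may not exist on disk
}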
static Optional<sys::fs::UniqueID> getUniqueID(StringRef path) {
sys::fs::UniqueID ret;
if (sys::fs::getUniqueID(path, ret))
return None;
return ret;
}
// Resolves a file path. This never returns the same path twice;
// if the file has already been added, it returns None.
Optional<StringRef> LinkerDriver::findFile(StringRef filename) {
StringRef path = doFindFile(filename);
if (Optional<sys::fs::UniqueID> id = getUniqueID(path)) {
bool seen = !visitedFiles.insert(*id).second;
if (seen)
return None;
}
if (path.endswith_lower(".lib"))
visitedLibs.insert(sys::path::filename(path));
return path;
}
// MinGW specific. If an embedded directive specifies linking to
// foo.lib, but it isn't found, try libfoo.a instead.
StringRef LinkerDriver::doFindLibMinGW(StringRef filename) {
if (filename.contains('/') || filename.contains('\\'))
return filename;
SmallString<128> s = filename;
sys::path::replace_extension(s, ".a");
StringRef libName = saver.save("lib" + s.str());
return doFindFile(libName);
}
// Find library file from search path.
StringRef LinkerDriver::doFindLib(StringRef filename) {
// Add ".lib" to Filename if that has no file extension.
bool hasExt = filename.contains('.');
if (!hasExt)
filename = saver.save(filename + ".lib");
StringRef ret = doFindFile(filename);
// For MinGW, if the find above didn't turn up anything, try
// looking for a MinGW formatted library name.
if (config->mingw && ret == filename)
return doFindLibMinGW(filename);
return ret;
}
// Resolves a library path. /nodefaultlib options are taken into
// consideration. This never returns the same path twice; if the
// library has already been added, it returns None.
Optional<StringRef> LinkerDriver::findLib(StringRef filename) {
if (config->noDefaultLibAll)
return None;
if (!visitedLibs.insert(filename.lower()).second)
return None;
StringRef path = doFindLib(filename);
if (config->noDefaultLibs.count(path.lower()))
return None;
if (Optional<sys::fs::UniqueID> id = getUniqueID(path))
if (!visitedFiles.insert(*id).second)
return None;
return path;
}
// Parses the LIB environment variable, which contains a list of search paths.
void LinkerDriver::addLibSearchPaths() {
Optional<std::string> envOpt = Process::GetEnv("LIB");
if (!envOpt.hasValue())
return;
StringRef env = saver.save(*envOpt);
while (!env.empty()) {
StringRef path;
std::tie(path, env) = env.split(';');
searchPaths.push_back(path);
}
}
Symbol *LinkerDriver::addUndefined(StringRef name) {
Symbol *b = symtab->addUndefined(name);
if (!b->isGCRoot) {
b->isGCRoot = true;
config->gcroot.push_back(b);
}
return b;
}
StringRef LinkerDriver::mangleMaybe(Symbol *s) {
// If the plain symbol name has already been resolved, do nothing.
Undefined *unmangled = dyn_cast<Undefined>(s);
if (!unmangled)
return "";
// Otherwise, see if a similar, mangled symbol exists in the symbol table.
Symbol *mangled = symtab->findMangle(unmangled->getName());
if (!mangled)
return "";
// If we find a similar mangled symbol, make this an alias to it and return
// its name.
log(unmangled->getName() + " aliased to " + mangled->getName());
unmangled->weakAlias = symtab->addUndefined(mangled->getName());
return mangled->getName();
}
// Windows specific -- find default entry point name.
//
// There are four different entry point functions for Windows executables,
// each of which corresponds to a user-defined "main" function. This function
// infers an entry point from a user-defined "main" function.
StringRef LinkerDriver::findDefaultEntry() {
assert(config->subsystem != IMAGE_SUBSYSTEM_UNKNOWN &&
"must handle /subsystem before calling this");
if (config->mingw)
return mangle(config->subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI
? "WinMainCRTStartup"
: "mainCRTStartup");
if (config->subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI) {
if (findUnderscoreMangle("wWinMain")) {
if (!findUnderscoreMangle("WinMain"))
return mangle("wWinMainCRTStartup");
warn("found both wWinMain and WinMain; using latter");
}
return mangle("WinMainCRTStartup");
}
if (findUnderscoreMangle("wmain")) {
if (!findUnderscoreMangle("main"))
return mangle("wmainCRTStartup");
warn("found both wmain and main; using latter");
}
return mangle("mainCRTStartup");
}
WindowsSubsystem LinkerDriver::inferSubsystem() {
if (config->dll)
return IMAGE_SUBSYSTEM_WINDOWS_GUI;
if (config->mingw)
return IMAGE_SUBSYSTEM_WINDOWS_CUI;
// Note that link.exe infers the subsystem from the presence of these
// functions even if /entry: or /nodefaultlib are passed, which causes them
// not to be called.
bool haveMain = findUnderscoreMangle("main");
bool haveWMain = findUnderscoreMangle("wmain");
bool haveWinMain = findUnderscoreMangle("WinMain");
bool haveWWinMain = findUnderscoreMangle("wWinMain");
if (haveMain || haveWMain) {
if (haveWinMain || haveWWinMain) {
warn(std::string("found ") + (haveMain ? "main" : "wmain") + " and " +
(haveWinMain ? "WinMain" : "wWinMain") +
"; defaulting to /subsystem:console");
}
return IMAGE_SUBSYSTEM_WINDOWS_CUI;
}
if (haveWinMain || haveWWinMain)
return IMAGE_SUBSYSTEM_WINDOWS_GUI;
return IMAGE_SUBSYSTEM_UNKNOWN;
}
static uint64_t getDefaultImageBase() {
if (config->is64())
return config->dll ? 0x180000000 : 0x140000000;
return config->dll ? 0x10000000 : 0x400000;
}
static std::string createResponseFile(const opt::InputArgList &args,
ArrayRef<StringRef> filePaths,
ArrayRef<StringRef> searchPaths) {
SmallString<0> data;
raw_svector_ostream os(data);
for (auto *arg : args) {
switch (arg->getOption().getID()) {
case OPT_linkrepro:
case OPT_INPUT:
case OPT_defaultlib:
case OPT_libpath:
case OPT_manifest:
case OPT_manifest_colon:
case OPT_manifestdependency:
case OPT_manifestfile:
case OPT_manifestinput:
case OPT_manifestuac:
break;
case OPT_implib:
case OPT_pdb:
case OPT_out:
os << arg->getSpelling() << sys::path::filename(arg->getValue()) << "\n";
break;
default:
os << toString(*arg) << "\n";
}
}
for (StringRef path : searchPaths) {
std::string relPath = relativeToRoot(path);
os << "/libpath:" << quote(relPath) << "\n";
}
for (StringRef path : filePaths)
os << quote(relativeToRoot(path)) << "\n";
return data.str();
}
enum class DebugKind { Unknown, None, Full, FastLink, GHash, Dwarf, Symtab };
static DebugKind parseDebugKind(const opt::InputArgList &args) {
auto *a = args.getLastArg(OPT_debug, OPT_debug_opt);
if (!a)
return DebugKind::None;
if (a->getNumValues() == 0)
return DebugKind::Full;
DebugKind debug = StringSwitch<DebugKind>(a->getValue())
.CaseLower("none", DebugKind::None)
.CaseLower("full", DebugKind::Full)
.CaseLower("fastlink", DebugKind::FastLink)
// LLD extensions
.CaseLower("ghash", DebugKind::GHash)
.CaseLower("dwarf", DebugKind::Dwarf)
.CaseLower("symtab", DebugKind::Symtab)
.Default(DebugKind::Unknown);
if (debug == DebugKind::FastLink) {
warn("/debug:fastlink unsupported; using /debug:full");
return DebugKind::Full;
}
if (debug == DebugKind::Unknown) {
error("/debug: unknown option: " + Twine(a->getValue()));
return DebugKind::None;
}
return debug;
}
static unsigned parseDebugTypes(const opt::InputArgList &args) {
unsigned debugTypes = static_cast<unsigned>(DebugType::None);
if (auto *a = args.getLastArg(OPT_debugtype)) {
SmallVector<StringRef, 3> types;
StringRef(a->getValue())
.split(types, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
for (StringRef type : types) {
unsigned v = StringSwitch<unsigned>(type.lower())
.Case("cv", static_cast<unsigned>(DebugType::CV))
.Case("pdata", static_cast<unsigned>(DebugType::PData))
.Case("fixup", static_cast<unsigned>(DebugType::Fixup))
.Default(0);
if (v == 0) {
warn("/debugtype: unknown option '" + type + "'");
continue;
}
debugTypes |= v;
}
return debugTypes;
}
// Default debug types
debugTypes = static_cast<unsigned>(DebugType::CV);
if (args.hasArg(OPT_driver))
debugTypes |= static_cast<unsigned>(DebugType::PData);
if (args.hasArg(OPT_profile))
debugTypes |= static_cast<unsigned>(DebugType::Fixup);
return debugTypes;
}
static std::string getMapFile(const opt::InputArgList &args) {
auto *arg = args.getLastArg(OPT_lldmap, OPT_lldmap_file);
if (!arg)
return "";
if (arg->getOption().getID() == OPT_lldmap_file)
return arg->getValue();
assert(arg->getOption().getID() == OPT_lldmap);
StringRef outFile = config->outputFile;
return (outFile.substr(0, outFile.rfind('.')) + ".map").str();
}
static std::string getImplibPath() {
if (!config->implib.empty())
return config->implib;
SmallString<128> out = StringRef(config->outputFile);
sys::path::replace_extension(out, ".lib");
return out.str();
}
//
// The import name is calculated as follows:
//
//      | LIBRARY w/ ext | LIBRARY w/o ext     | no LIBRARY
// -----+----------------+---------------------+------------------
// LINK | {value}        | {value}.{.dll/.exe} | {output name}
//  LIB | {value}        | {value}.dll         | {output name}.dll
//
static std::string getImportName(bool asLib) {
SmallString<128> out;
if (config->importName.empty()) {
out.assign(sys::path::filename(config->outputFile));
if (asLib)
sys::path::replace_extension(out, ".dll");
} else {
out.assign(config->importName);
if (!sys::path::has_extension(out))
sys::path::replace_extension(out,
(config->dll || asLib) ? ".dll" : ".exe");
}
return out.str();
}
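// Illustrative sketch (not part of this diff): the table above, written with
// std::filesystem so the rule can be compiled and exercised on its own.
// asLib corresponds to running in lib.exe mode; demoImportName and its
// parameters are hypothetical.
#include <filesystem>
#include <string>

static std::string demoImportName(const std::string &libraryValue, // "" if no LIBRARY directive
                                  const std::string &outputFile,
                                  bool isDll, bool asLib) {
  namespace fs = std::filesystem;
  fs::path out;
  if (libraryValue.empty()) {
    out = fs::path(outputFile).filename();        // "no LIBRARY" column
    if (asLib)
      out.replace_extension(".dll");              // LIB mode appends .dll
  } else {
    out = libraryValue;                           // LIBRARY w/ or w/o extension
    if (!out.has_extension())
      out.replace_extension((isDll || asLib) ? ".dll" : ".exe");
  }
  return out.string();
}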
static void createImportLibrary(bool asLib) {
std::vector<COFFShortExport> exports;
for (Export &e1 : config->exports) {
COFFShortExport e2;
e2.Name = e1.name;
e2.SymbolName = e1.symbolName;
e2.ExtName = e1.extName;
e2.Ordinal = e1.ordinal;
e2.Noname = e1.noname;
e2.Data = e1.data;
e2.Private = e1.isPrivate;
e2.Constant = e1.constant;
exports.push_back(e2);
}
auto handleError = [](Error &&e) {
handleAllErrors(std::move(e),
[](ErrorInfoBase &eib) { error(eib.message()); });
};
std::string libName = getImportName(asLib);
std::string path = getImplibPath();
if (!config->incremental) {
handleError(writeImportLibrary(libName, path, exports, config->machine,
config->mingw));
return;
}
// If the import library already exists, replace it only if the contents
// have changed.
ErrorOr<std::unique_ptr<MemoryBuffer>> oldBuf = MemoryBuffer::getFile(
path, /*FileSize*/ -1, /*RequiresNullTerminator*/ false);
if (!oldBuf) {
handleError(writeImportLibrary(libName, path, exports, config->machine,
config->mingw));
return;
}
SmallString<128> tmpName;
if (std::error_code ec =
sys::fs::createUniqueFile(path + ".tmp-%%%%%%%%.lib", tmpName))
fatal("cannot create temporary file for import library " + path + ": " +
ec.message());
if (Error e = writeImportLibrary(libName, tmpName, exports, config->machine,
config->mingw)) {
handleError(std::move(e));
return;
}
std::unique_ptr<MemoryBuffer> newBuf = check(MemoryBuffer::getFile(
tmpName, /*FileSize*/ -1, /*RequiresNullTerminator*/ false));
if ((*oldBuf)->getBuffer() != newBuf->getBuffer()) {
oldBuf->reset();
handleError(errorCodeToError(sys::fs::rename(tmpName, path)));
} else {
sys::fs::remove(tmpName);
}
}
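// Illustrative sketch (not part of this diff): the "write to a temp file,
// compare, rename only on change" pattern used above, reduced to standard
// library facilities. Leaving the old file untouched when nothing changed is
// what keeps incremental rebuilds cheap. demoReplaceIfChanged is
// hypothetical, and the unique temp-name generation is deliberately elided.
#include <filesystem>
#include <fstream>
#include <sstream>
#include <string>

static bool demoReplaceIfChanged(const std::filesystem::path &target,
                                 const std::string &newContents) {
  namespace fs = std::filesystem;
  std::ifstream oldIn(target, std::ios::binary);
  if (oldIn) {
    std::ostringstream oldBuf;
    oldBuf << oldIn.rdbuf();
    if (oldBuf.str() == newContents)
      return false;                  // identical: keep the old file untouched
  }
  fs::path tmp = target;
  tmp += ".tmp";                     // unique-suffix generation elided here
  std::ofstream(tmp, std::ios::binary) << newContents;
  fs::rename(tmp, target);           // swap into place, overwriting the old file
  return true;
}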
static void parseModuleDefs(StringRef path) {
std::unique_ptr<MemoryBuffer> mb = CHECK(
MemoryBuffer::getFile(path, -1, false, true), "could not open " + path);
COFFModuleDefinition m = check(parseCOFFModuleDefinition(
mb->getMemBufferRef(), config->machine, config->mingw));
if (config->outputFile.empty())
config->outputFile = saver.save(m.OutputFile);
config->importName = saver.save(m.ImportName);
if (m.ImageBase)
config->imageBase = m.ImageBase;
if (m.StackReserve)
config->stackReserve = m.StackReserve;
if (m.StackCommit)
config->stackCommit = m.StackCommit;
if (m.HeapReserve)
config->heapReserve = m.HeapReserve;
if (m.HeapCommit)
config->heapCommit = m.HeapCommit;
if (m.MajorImageVersion)
config->majorImageVersion = m.MajorImageVersion;
if (m.MinorImageVersion)
config->minorImageVersion = m.MinorImageVersion;
if (m.MajorOSVersion)
config->majorOSVersion = m.MajorOSVersion;
if (m.MinorOSVersion)
config->minorOSVersion = m.MinorOSVersion;
for (COFFShortExport e1 : m.Exports) {
Export e2;
// In simple cases, only Name is set. Renamed exports are parsed
// and set as "ExtName = Name". If Name has the form "OtherDll.Func",
// it is not a normal exported function but a forward to another
// DLL. This is supported by both MS and GNU linkers.
if (e1.ExtName != e1.Name && StringRef(e1.Name).contains('.')) {
e2.name = saver.save(e1.ExtName);
e2.forwardTo = saver.save(e1.Name);
config->exports.push_back(e2);
continue;
}
e2.name = saver.save(e1.Name);
e2.extName = saver.save(e1.ExtName);
e2.ordinal = e1.Ordinal;
e2.noname = e1.Noname;
e2.data = e1.Data;
e2.isPrivate = e1.Private;
e2.constant = e1.Constant;
config->exports.push_back(e2);
}
}
void LinkerDriver::enqueueTask(std::function<void()> task) {
taskQueue.push_back(std::move(task));
}
bool LinkerDriver::run() {
ScopedTimer t(inputFileTimer);
bool didWork = !taskQueue.empty();
while (!taskQueue.empty()) {
taskQueue.front()();
taskQueue.pop_front();
}
return didWork;
}
// Parse an /order file. If an option is given, the linker places
// COMDAT sections in the same order as their names appear in the
// given file.
static void parseOrderFile(StringRef arg) {
// For some reason, the MSVC linker requires a filename to be
// preceded by "@".
if (!arg.startswith("@")) {
error("malformed /order option: '@' missing");
return;
}
// Get a list of all comdat sections for error checking.
DenseSet<StringRef> set;
for (Chunk *c : symtab->getChunks())
if (auto *sec = dyn_cast<SectionChunk>(c))
if (sec->sym)
set.insert(sec->sym->getName());
// Open a file.
StringRef path = arg.substr(1);
std::unique_ptr<MemoryBuffer> mb = CHECK(
MemoryBuffer::getFile(path, -1, false, true), "could not open " + path);
// Parse a file. An order file contains one symbol per line.
// All symbols that were not present in a given order file are
// considered to have the lowest priority 0 and are placed at
// the end of the output section.
for (std::string s : args::getLines(mb->getMemBufferRef())) {
if (config->machine == I386 && !isDecorated(s))
s = "_" + s;
if (set.count(s) == 0) {
if (config->warnMissingOrderSymbol)
warn("/order:" + arg + ": missing symbol: " + s + " [LNK4037]");
} else
config->order[s] = INT_MIN + config->order.size();
}
}
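// Illustrative sketch (not part of this diff): why INT_MIN + order.size()
// works as a priority. Earlier lines get smaller (more negative) keys, while
// anything absent from the map implicitly gets priority 0 and therefore sorts
// after every listed symbol. demoReadOrder is hypothetical; unlike the code
// above, duplicates keep their first occurrence here.
#include <climits>
#include <map>
#include <sstream>
#include <string>

static std::map<std::string, int> demoReadOrder(const std::string &fileText) {
  std::map<std::string, int> order;
  std::istringstream in(fileText);
  std::string line;
  while (std::getline(in, line))
    if (!line.empty() && !order.count(line))
      order[line] = INT_MIN + static_cast<int>(order.size()); // first line wins
  return order;
}
// Sorting sections by (order.count(name) ? order[name] : 0) then preserves the
// file's own ordering for listed symbols and pushes unlisted ones to the end.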
static void markAddrsig(Symbol *s) {
if (auto *d = dyn_cast_or_null<Defined>(s))
if (SectionChunk *c = dyn_cast_or_null<SectionChunk>(d->getChunk()))
c->keepUnique = true;
}
static void findKeepUniqueSections() {
// Exported symbols could be address-significant in other executables or DSOs,
// so we conservatively mark them as address-significant.
for (Export &r : config->exports)
markAddrsig(r.sym);
// Visit the address-significance table in each object file and mark each
// referenced symbol as address-significant.
for (ObjFile *obj : ObjFile::instances) {
ArrayRef<Symbol *> syms = obj->getSymbols();
if (obj->addrsigSec) {
ArrayRef<uint8_t> contents;
cantFail(
obj->getCOFFObj()->getSectionContents(obj->addrsigSec, contents));
const uint8_t *cur = contents.begin();
while (cur != contents.end()) {
unsigned size;
const char *err;
uint64_t symIndex = decodeULEB128(cur, &size, contents.end(), &err);
if (err)
fatal(toString(obj) + ": could not decode addrsig section: " + err);
if (symIndex >= syms.size())
fatal(toString(obj) + ": invalid symbol index in addrsig section");
markAddrsig(syms[symIndex]);
cur += size;
}
} else {
// If an object file does not have an address-significance table,
// conservatively mark all of its symbols as address-significant.
for (Symbol *s : syms)
markAddrsig(s);
}
}
}
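// Illustrative sketch (not part of this diff): a minimal ULEB128 decoder of
// the kind decodeULEB128 provides, enough to walk an addrsig-style blob of
// variable-length symbol indices. demoDecodeULEB128/demoReadAddrsig are
// hypothetical and omit the overflow checks a production decoder needs.
#include <cstdint>
#include <vector>

static uint64_t demoDecodeULEB128(const uint8_t *&p, const uint8_t *end,
                                  bool &err) {
  uint64_t value = 0;
  unsigned shift = 0;
  while (p != end) {
    uint8_t byte = *p++;
    value |= uint64_t(byte & 0x7f) << shift;     // low 7 bits are payload
    if ((byte & 0x80) == 0)                      // high bit clear: last byte
      return value;
    shift += 7;
  }
  err = true;                                    // ran off the end of the blob
  return 0;
}

static std::vector<uint64_t> demoReadAddrsig(const std::vector<uint8_t> &contents) {
  std::vector<uint64_t> symIndices;
  const uint8_t *cur = contents.data(), *end = cur + contents.size();
  bool err = false;
  while (cur != end && !err)
    symIndices.push_back(demoDecodeULEB128(cur, end, err));
  if (err)
    symIndices.clear();                          // malformed table: caller reports
  return symIndices;
}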
// link.exe replaces each %foo% in altPath with the contents of environment
// variable foo, and adds the two magic env vars _PDB (expands to the basename
// of pdb's output path) and _EXT (expands to the extension of the output
// binary).
// lld only supports %_PDB% and %_EXT% and warns on references to all other env
// vars.
static void parsePDBAltPath(StringRef altPath) {
SmallString<128> buf;
StringRef pdbBasename =
sys::path::filename(config->pdbPath, sys::path::Style::windows);
StringRef binaryExtension =
sys::path::extension(config->outputFile, sys::path::Style::windows);
if (!binaryExtension.empty())
binaryExtension = binaryExtension.substr(1); // %_EXT% does not include '.'.
// Invariant:
// +--------- cursor ('a...' might be the empty string).
// | +----- firstMark
// | | +- secondMark
// v v v
// a...%...%...
size_t cursor = 0;
while (cursor < altPath.size()) {
size_t firstMark, secondMark;
if ((firstMark = altPath.find('%', cursor)) == StringRef::npos ||
(secondMark = altPath.find('%', firstMark + 1)) == StringRef::npos) {
// Didn't find another full fragment, treat rest of string as literal.
buf.append(altPath.substr(cursor));
break;
}
// Found a full fragment. Append text in front of first %, and interpret
// text between first and second % as variable name.
buf.append(altPath.substr(cursor, firstMark - cursor));
StringRef var = altPath.substr(firstMark, secondMark - firstMark + 1);
if (var.equals_lower("%_pdb%"))
buf.append(pdbBasename);
else if (var.equals_lower("%_ext%"))
buf.append(binaryExtension);
else {
warn("only %_PDB% and %_EXT% supported in /pdbaltpath:, keeping " +
var + " as literal");
buf.append(var);
}
cursor = secondMark + 1;
}
config->pdbAltPath = buf;
}
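// Illustrative sketch (not part of this diff): the cursor/firstMark/secondMark
// scan above, reduced to std::string. Recognized %VAR% fragments are replaced
// from a small map and unknown ones are kept literally, mirroring the warning
// behaviour described above; case-insensitive matching is elided.
// demoExpandPercentVars is hypothetical.
#include <map>
#include <string>

static std::string
demoExpandPercentVars(const std::string &in,
                      const std::map<std::string, std::string> &vars) {
  std::string out;
  size_t cursor = 0;
  while (cursor < in.size()) {
    size_t first = in.find('%', cursor);
    size_t second = first == std::string::npos ? std::string::npos
                                               : in.find('%', first + 1);
    if (second == std::string::npos) {           // no full %...% fragment left
      out += in.substr(cursor);
      break;
    }
    out += in.substr(cursor, first - cursor);    // literal text before the '%'
    std::string var = in.substr(first, second - first + 1); // includes both '%'
    auto it = vars.find(var);
    out += (it != vars.end()) ? it->second : var;
    cursor = second + 1;
  }
  return out;
}
// e.g. demoExpandPercentVars("c:\\syms\\%_PDB%", {{"%_PDB%", "app.pdb"}})
// yields "c:\\syms\\app.pdb".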
/// Check that at most one resource obj file was used.
/// Call after ObjFile::instances is complete.
static void diagnoseMultipleResourceObjFiles() {
// The .rsrc$01 section in a resource obj file contains a tree description
// of resources. Merging multiple resource obj files would require merging
// the trees instead of using usual linker section merging semantics.
// Since link.exe disallows linking more than one resource obj file with
// LNK4078, mirror that. The normal use of resource files is to give the
// linker many .res files, which are then converted to a single resource obj
// file internally, so this is not a big restriction in practice.
ObjFile *resourceObjFile = nullptr;
for (ObjFile *f : ObjFile::instances) {
if (!f->isResourceObjFile)
continue;
if (!resourceObjFile) {
resourceObjFile = f;
continue;
}
error(toString(f) +
": more than one resource obj file not allowed, already got " +
toString(resourceObjFile));
}
}
// In MinGW, if no symbols are chosen to be exported, then all symbols are
// automatically exported by default. This behavior can be forced by the
// -export-all-symbols option, so that it happens even when exports are
// explicitly specified. The automatic behavior can be disabled using the
// -exclude-all-symbols option, so that lld-link behaves like link.exe rather
// than MinGW in the case that nothing is explicitly exported.
void LinkerDriver::maybeExportMinGWSymbols(const opt::InputArgList &args) {
if (!config->dll)
return;
if (!args.hasArg(OPT_export_all_symbols)) {
if (!config->exports.empty())
return;
if (args.hasArg(OPT_exclude_all_symbols))
return;
}
AutoExporter exporter;
for (auto *arg : args.filtered(OPT_wholearchive_file))
if (Optional<StringRef> path = doFindFile(arg->getValue()))
exporter.addWholeArchive(*path);
symtab->forEachSymbol([&](Symbol *s) {
auto *def = dyn_cast<Defined>(s);
if (!exporter.shouldExport(def))
return;
Export e;
e.name = def->getName();
e.sym = def;
if (Chunk *c = def->getChunk())
if (!(c->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE))
e.data = true;
config->exports.push_back(e);
});
}
+static const char *libcallRoutineNames[] = {
+#define HANDLE_LIBCALL(code, name) name,
+#include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
+};
+
void LinkerDriver::link(ArrayRef<const char *> argsArr) {
// Needed for LTO.
InitializeAllTargetInfos();
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmParsers();
InitializeAllAsmPrinters();
// If the first command line argument is "/lib", link.exe acts like lib.exe.
// We call our own implementation of lib.exe that understands bitcode files.
if (argsArr.size() > 1 && StringRef(argsArr[1]).equals_lower("/lib")) {
if (llvm::libDriverMain(argsArr.slice(1)) != 0)
fatal("lib failed");
return;
}
// Parse command line options.
ArgParser parser;
opt::InputArgList args = parser.parseLINK(argsArr);
// Parse and evaluate -mllvm options.
std::vector<const char *> v;
v.push_back("lld-link (LLVM option parsing)");
for (auto *arg : args.filtered(OPT_mllvm))
v.push_back(arg->getValue());
cl::ParseCommandLineOptions(v.size(), v.data());
// Handle /errorlimit early, because error() depends on it.
if (auto *arg = args.getLastArg(OPT_errorlimit)) {
int n = 20;
StringRef s = arg->getValue();
if (s.getAsInteger(10, n))
error(arg->getSpelling() + " number expected, but got " + s);
errorHandler().errorLimit = n;
}
// Handle /help
if (args.hasArg(OPT_help)) {
printHelp(argsArr[0]);
return;
}
lld::threadsEnabled = args.hasFlag(OPT_threads, OPT_threads_no, true);
if (args.hasArg(OPT_show_timing))
config->showTiming = true;
config->showSummary = args.hasArg(OPT_summary);
ScopedTimer t(Timer::root());
// Handle --version, which is an lld extension. This option is a bit odd
// because it doesn't start with "/", but we deliberately chose "--" to
// avoid conflict with /version and for compatibility with clang-cl.
if (args.hasArg(OPT_dash_dash_version)) {
outs() << getLLDVersion() << "\n";
return;
}
// Handle /lldmingw early, since it can potentially affect how other
// options are handled.
config->mingw = args.hasArg(OPT_lldmingw);
if (auto *arg = args.getLastArg(OPT_linkrepro)) {
SmallString<64> path = StringRef(arg->getValue());
sys::path::append(path, "repro.tar");
Expected<std::unique_ptr<TarWriter>> errOrWriter =
TarWriter::create(path, "repro");
if (errOrWriter) {
tar = std::move(*errOrWriter);
} else {
error("/linkrepro: failed to open " + path + ": " +
toString(errOrWriter.takeError()));
}
}
if (!args.hasArg(OPT_INPUT)) {
if (args.hasArg(OPT_deffile))
config->noEntry = true;
else
fatal("no input files");
}
// Construct search path list.
searchPaths.push_back("");
for (auto *arg : args.filtered(OPT_libpath))
searchPaths.push_back(arg->getValue());
addLibSearchPaths();
// Handle /ignore
for (auto *arg : args.filtered(OPT_ignore)) {
SmallVector<StringRef, 8> vec;
StringRef(arg->getValue()).split(vec, ',');
for (StringRef s : vec) {
if (s == "4037")
config->warnMissingOrderSymbol = false;
else if (s == "4099")
config->warnDebugInfoUnusable = false;
else if (s == "4217")
config->warnLocallyDefinedImported = false;
// Other warning numbers are ignored.
}
}
// Handle /out
if (auto *arg = args.getLastArg(OPT_out))
config->outputFile = arg->getValue();
// Handle /verbose
if (args.hasArg(OPT_verbose))
config->verbose = true;
errorHandler().verbose = config->verbose;
// Handle /force or /force:unresolved
if (args.hasArg(OPT_force, OPT_force_unresolved))
config->forceUnresolved = true;
// Handle /force or /force:multiple
if (args.hasArg(OPT_force, OPT_force_multiple))
config->forceMultiple = true;
// Handle /force or /force:multipleres
if (args.hasArg(OPT_force, OPT_force_multipleres))
config->forceMultipleRes = true;
// Handle /debug
DebugKind debug = parseDebugKind(args);
if (debug == DebugKind::Full || debug == DebugKind::Dwarf ||
debug == DebugKind::GHash) {
config->debug = true;
config->incremental = true;
}
// Handle /demangle
config->demangle = args.hasFlag(OPT_demangle, OPT_demangle_no);
// Handle /debugtype
config->debugTypes = parseDebugTypes(args);
// Handle /pdb
bool shouldCreatePDB =
(debug == DebugKind::Full || debug == DebugKind::GHash);
if (shouldCreatePDB) {
if (auto *arg = args.getLastArg(OPT_pdb))
config->pdbPath = arg->getValue();
if (auto *arg = args.getLastArg(OPT_pdbaltpath))
config->pdbAltPath = arg->getValue();
if (args.hasArg(OPT_natvis))
config->natvisFiles = args.getAllArgValues(OPT_natvis);
if (auto *arg = args.getLastArg(OPT_pdb_source_path))
config->pdbSourcePath = arg->getValue();
}
// Handle /noentry
if (args.hasArg(OPT_noentry)) {
if (args.hasArg(OPT_dll))
config->noEntry = true;
else
error("/noentry must be specified with /dll");
}
// Handle /dll
if (args.hasArg(OPT_dll)) {
config->dll = true;
config->manifestID = 2;
}
// Handle /dynamicbase and /fixed. We can't use hasFlag for /dynamicbase
// because we need to explicitly check whether that option or its inverse was
// present in the argument list in order to handle /fixed.
auto *dynamicBaseArg = args.getLastArg(OPT_dynamicbase, OPT_dynamicbase_no);
if (dynamicBaseArg &&
dynamicBaseArg->getOption().getID() == OPT_dynamicbase_no)
config->dynamicBase = false;
// MSDN claims "/FIXED:NO is the default setting for a DLL, and /FIXED is the
// default setting for any other project type.", but link.exe defaults to
// /FIXED:NO for exe outputs as well. Match behavior, not docs.
bool fixed = args.hasFlag(OPT_fixed, OPT_fixed_no, false);
if (fixed) {
if (dynamicBaseArg &&
dynamicBaseArg->getOption().getID() == OPT_dynamicbase) {
error("/fixed must not be specified with /dynamicbase");
} else {
config->relocatable = false;
config->dynamicBase = false;
}
}
// Handle /appcontainer
config->appContainer =
args.hasFlag(OPT_appcontainer, OPT_appcontainer_no, false);
// Handle /machine
if (auto *arg = args.getLastArg(OPT_machine)) {
config->machine = getMachineType(arg->getValue());
if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN)
fatal(Twine("unknown /machine argument: ") + arg->getValue());
}
// Handle /nodefaultlib:<filename>
for (auto *arg : args.filtered(OPT_nodefaultlib))
config->noDefaultLibs.insert(doFindLib(arg->getValue()).lower());
// Handle /nodefaultlib
if (args.hasArg(OPT_nodefaultlib_all))
config->noDefaultLibAll = true;
// Handle /base
if (auto *arg = args.getLastArg(OPT_base))
parseNumbers(arg->getValue(), &config->imageBase);
// Handle /filealign
if (auto *arg = args.getLastArg(OPT_filealign)) {
parseNumbers(arg->getValue(), &config->fileAlign);
if (!isPowerOf2_64(config->fileAlign))
error("/filealign: not a power of two: " + Twine(config->fileAlign));
}
// Handle /stack
if (auto *arg = args.getLastArg(OPT_stack))
parseNumbers(arg->getValue(), &config->stackReserve, &config->stackCommit);
// Handle /guard:cf
if (auto *arg = args.getLastArg(OPT_guard))
parseGuard(arg->getValue());
// Handle /heap
if (auto *arg = args.getLastArg(OPT_heap))
parseNumbers(arg->getValue(), &config->heapReserve, &config->heapCommit);
// Handle /version
if (auto *arg = args.getLastArg(OPT_version))
parseVersion(arg->getValue(), &config->majorImageVersion,
&config->minorImageVersion);
// Handle /subsystem
if (auto *arg = args.getLastArg(OPT_subsystem))
parseSubsystem(arg->getValue(), &config->subsystem, &config->majorOSVersion,
&config->minorOSVersion);
// Handle /timestamp
if (llvm::opt::Arg *arg = args.getLastArg(OPT_timestamp, OPT_repro)) {
if (arg->getOption().getID() == OPT_repro) {
config->timestamp = 0;
config->repro = true;
} else {
config->repro = false;
StringRef value(arg->getValue());
if (value.getAsInteger(0, config->timestamp))
fatal(Twine("invalid timestamp: ") + value +
". Expected 32-bit integer");
}
} else {
config->repro = false;
config->timestamp = time(nullptr);
}
// Handle /alternatename
for (auto *arg : args.filtered(OPT_alternatename))
parseAlternateName(arg->getValue());
// Handle /include
for (auto *arg : args.filtered(OPT_incl))
addUndefined(arg->getValue());
// Handle /implib
if (auto *arg = args.getLastArg(OPT_implib))
config->implib = arg->getValue();
// Handle /opt.
bool doGC = debug == DebugKind::None || args.hasArg(OPT_profile);
unsigned icfLevel =
args.hasArg(OPT_profile) ? 0 : 1; // 0: off, 1: limited, 2: on
unsigned tailMerge = 1;
for (auto *arg : args.filtered(OPT_opt)) {
std::string str = StringRef(arg->getValue()).lower();
SmallVector<StringRef, 1> vec;
StringRef(str).split(vec, ',');
for (StringRef s : vec) {
if (s == "ref") {
doGC = true;
} else if (s == "noref") {
doGC = false;
} else if (s == "icf" || s.startswith("icf=")) {
icfLevel = 2;
} else if (s == "noicf") {
icfLevel = 0;
} else if (s == "lldtailmerge") {
tailMerge = 2;
} else if (s == "nolldtailmerge") {
tailMerge = 0;
} else if (s.startswith("lldlto=")) {
StringRef optLevel = s.substr(7);
if (optLevel.getAsInteger(10, config->ltoo) || config->ltoo > 3)
error("/opt:lldlto: invalid optimization level: " + optLevel);
} else if (s.startswith("lldltojobs=")) {
StringRef jobs = s.substr(11);
if (jobs.getAsInteger(10, config->thinLTOJobs) ||
config->thinLTOJobs == 0)
error("/opt:lldltojobs: invalid job count: " + jobs);
} else if (s.startswith("lldltopartitions=")) {
StringRef n = s.substr(17);
if (n.getAsInteger(10, config->ltoPartitions) ||
config->ltoPartitions == 0)
error("/opt:lldltopartitions: invalid partition count: " + n);
} else if (s != "lbr" && s != "nolbr")
error("/opt: unknown option: " + s);
}
}
// Limited ICF is enabled if GC is enabled and ICF was never mentioned
// explicitly.
// FIXME: LLD only implements "limited" ICF, i.e. it only merges identical
// code. If the user passes /OPT:ICF explicitly, LLD should merge identical
// comdat readonly data.
if (icfLevel == 1 && !doGC)
icfLevel = 0;
config->doGC = doGC;
config->doICF = icfLevel > 0;
config->tailMerge = (tailMerge == 1 && config->doICF) || tailMerge == 2;
// Handle /lldsavetemps
if (args.hasArg(OPT_lldsavetemps))
config->saveTemps = true;
// Handle /kill-at
if (args.hasArg(OPT_kill_at))
config->killAt = true;
// Handle /lldltocache
if (auto *arg = args.getLastArg(OPT_lldltocache))
config->ltoCache = arg->getValue();
// Handle /lldltocachepolicy
if (auto *arg = args.getLastArg(OPT_lldltocachepolicy))
config->ltoCachePolicy = CHECK(
parseCachePruningPolicy(arg->getValue()),
Twine("/lldltocachepolicy: invalid cache policy: ") + arg->getValue());
// Handle /failifmismatch
for (auto *arg : args.filtered(OPT_failifmismatch))
checkFailIfMismatch(arg->getValue(), nullptr);
// Handle /merge
for (auto *arg : args.filtered(OPT_merge))
parseMerge(arg->getValue());
// Add default section merging rules after user rules. User rules take
// precedence, but we will emit a warning if there is a conflict.
parseMerge(".idata=.rdata");
parseMerge(".didat=.rdata");
parseMerge(".edata=.rdata");
parseMerge(".xdata=.rdata");
parseMerge(".bss=.data");
if (config->mingw) {
parseMerge(".ctors=.rdata");
parseMerge(".dtors=.rdata");
parseMerge(".CRT=.rdata");
}
// Handle /section
for (auto *arg : args.filtered(OPT_section))
parseSection(arg->getValue());
+ // Handle /align
+ if (auto *arg = args.getLastArg(OPT_align)) {
+ parseNumbers(arg->getValue(), &config->align);
+ if (!isPowerOf2_64(config->align))
+ error("/align: not a power of two: " + StringRef(arg->getValue()));
+ }
+
// Handle /aligncomm
for (auto *arg : args.filtered(OPT_aligncomm))
parseAligncomm(arg->getValue());
// Handle /manifestdependency. This enables /manifest unless /manifest:no is
// also passed.
if (auto *arg = args.getLastArg(OPT_manifestdependency)) {
config->manifestDependency = arg->getValue();
config->manifest = Configuration::SideBySide;
}
// Handle /manifest and /manifest:
if (auto *arg = args.getLastArg(OPT_manifest, OPT_manifest_colon)) {
if (arg->getOption().getID() == OPT_manifest)
config->manifest = Configuration::SideBySide;
else
parseManifest(arg->getValue());
}
// Handle /manifestuac
if (auto *arg = args.getLastArg(OPT_manifestuac))
parseManifestUAC(arg->getValue());
// Handle /manifestfile
if (auto *arg = args.getLastArg(OPT_manifestfile))
config->manifestFile = arg->getValue();
// Handle /manifestinput
for (auto *arg : args.filtered(OPT_manifestinput))
config->manifestInput.push_back(arg->getValue());
if (!config->manifestInput.empty() &&
config->manifest != Configuration::Embed) {
fatal("/manifestinput: requires /manifest:embed");
}
config->thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files);
config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) ||
args.hasArg(OPT_thinlto_index_only_arg);
config->thinLTOIndexOnlyArg =
args.getLastArgValue(OPT_thinlto_index_only_arg);
config->thinLTOPrefixReplace =
getOldNewOptions(args, OPT_thinlto_prefix_replace);
config->thinLTOObjectSuffixReplace =
getOldNewOptions(args, OPT_thinlto_object_suffix_replace);
// Handle miscellaneous boolean flags.
config->allowBind = args.hasFlag(OPT_allowbind, OPT_allowbind_no, true);
config->allowIsolation =
args.hasFlag(OPT_allowisolation, OPT_allowisolation_no, true);
config->incremental =
args.hasFlag(OPT_incremental, OPT_incremental_no,
!config->doGC && !config->doICF && !args.hasArg(OPT_order) &&
!args.hasArg(OPT_profile));
config->integrityCheck =
args.hasFlag(OPT_integritycheck, OPT_integritycheck_no, false);
config->nxCompat = args.hasFlag(OPT_nxcompat, OPT_nxcompat_no, true);
for (auto *arg : args.filtered(OPT_swaprun))
parseSwaprun(arg->getValue());
config->terminalServerAware =
!config->dll && args.hasFlag(OPT_tsaware, OPT_tsaware_no, true);
config->debugDwarf = debug == DebugKind::Dwarf;
config->debugGHashes = debug == DebugKind::GHash;
config->debugSymtab = debug == DebugKind::Symtab;
config->mapFile = getMapFile(args);
if (config->incremental && args.hasArg(OPT_profile)) {
warn("ignoring '/incremental' due to '/profile' specification");
config->incremental = false;
}
if (config->incremental && args.hasArg(OPT_order)) {
warn("ignoring '/incremental' due to '/order' specification");
config->incremental = false;
}
if (config->incremental && config->doGC) {
warn("ignoring '/incremental' because REF is enabled; use '/opt:noref' to "
"disable");
config->incremental = false;
}
if (config->incremental && config->doICF) {
warn("ignoring '/incremental' because ICF is enabled; use '/opt:noicf' to "
"disable");
config->incremental = false;
}
if (errorCount())
return;
std::set<sys::fs::UniqueID> wholeArchives;
for (auto *arg : args.filtered(OPT_wholearchive_file))
if (Optional<StringRef> path = doFindFile(arg->getValue()))
if (Optional<sys::fs::UniqueID> id = getUniqueID(*path))
wholeArchives.insert(*id);
// A predicate returning true if a given path is an argument for
// /wholearchive:, or if /wholearchive is enabled globally.
// This function is a bit tricky because "foo.obj /wholearchive:././foo.obj"
// needs to be handled as "/wholearchive:foo.obj foo.obj".
auto isWholeArchive = [&](StringRef path) -> bool {
if (args.hasArg(OPT_wholearchive_flag))
return true;
if (Optional<sys::fs::UniqueID> id = getUniqueID(path))
return wholeArchives.count(*id);
return false;
};
// Create a list of input files. Files can be given as arguments
// for /defaultlib option.
for (auto *arg : args.filtered(OPT_INPUT, OPT_wholearchive_file))
if (Optional<StringRef> path = findFile(arg->getValue()))
enqueuePath(*path, isWholeArchive(*path));
for (auto *arg : args.filtered(OPT_defaultlib))
if (Optional<StringRef> path = findLib(arg->getValue()))
enqueuePath(*path, false);
// Windows specific -- Create a resource file containing a manifest file.
if (config->manifest == Configuration::Embed)
addBuffer(createManifestRes(), false);
// Read all input files given via the command line.
run();
if (errorCount())
return;
// We should have inferred a machine type by now from the input files, but if
// not we assume x64.
if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN) {
warn("/machine is not specified. x64 is assumed");
config->machine = AMD64;
}
config->wordsize = config->is64() ? 8 : 4;
// Handle /safeseh, x86 only, on by default, except for mingw.
if (config->machine == I386 &&
args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw))
config->safeSEH = true;
// Handle /functionpadmin
for (auto *arg : args.filtered(OPT_functionpadmin, OPT_functionpadmin_opt))
parseFunctionPadMin(arg, config->machine);
// Input files can be Windows resource files (.res files). We use
// WindowsResource to convert resource files to a regular COFF file,
// then link the resulting file normally.
if (!resources.empty())
symtab->addFile(make<ObjFile>(convertResToCOFF(resources)));
if (tar)
tar->append("response.txt",
createResponseFile(args, filePaths,
ArrayRef<StringRef>(searchPaths).slice(1)));
// Handle /largeaddressaware
config->largeAddressAware = args.hasFlag(
OPT_largeaddressaware, OPT_largeaddressaware_no, config->is64());
// Handle /highentropyva
config->highEntropyVA =
config->is64() &&
args.hasFlag(OPT_highentropyva, OPT_highentropyva_no, true);
if (!config->dynamicBase &&
(config->machine == ARMNT || config->machine == ARM64))
error("/dynamicbase:no is not compatible with " +
machineToStr(config->machine));
// Handle /export
for (auto *arg : args.filtered(OPT_export)) {
Export e = parseExport(arg->getValue());
if (config->machine == I386) {
if (!isDecorated(e.name))
e.name = saver.save("_" + e.name);
if (!e.extName.empty() && !isDecorated(e.extName))
e.extName = saver.save("_" + e.extName);
}
config->exports.push_back(e);
}
// Handle /def
if (auto *arg = args.getLastArg(OPT_deffile)) {
// parseModuleDefs mutates Config object.
parseModuleDefs(arg->getValue());
}
// Handle generation of import library from a def file.
if (!args.hasArg(OPT_INPUT)) {
fixupExports();
createImportLibrary(/*asLib=*/true);
return;
}
// Windows specific -- if no /subsystem is given, we need to infer
// that from entry point name. Must happen before /entry handling,
// and after the early return when just writing an import library.
if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) {
config->subsystem = inferSubsystem();
if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN)
fatal("subsystem must be defined");
}
// Handle /entry and /dll
if (auto *arg = args.getLastArg(OPT_entry)) {
config->entry = addUndefined(mangle(arg->getValue()));
} else if (!config->entry && !config->noEntry) {
if (args.hasArg(OPT_dll)) {
StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12"
: "_DllMainCRTStartup";
config->entry = addUndefined(s);
} else {
// Windows specific -- If entry point name is not given, we need to
// infer that from user-defined entry name.
StringRef s = findDefaultEntry();
if (s.empty())
fatal("entry point must be defined");
config->entry = addUndefined(s);
log("Entry name inferred: " + s);
}
}
// Handle /delayload
for (auto *arg : args.filtered(OPT_delayload)) {
config->delayLoads.insert(StringRef(arg->getValue()).lower());
if (config->machine == I386) {
config->delayLoadHelper = addUndefined("___delayLoadHelper2@8");
} else {
config->delayLoadHelper = addUndefined("__delayLoadHelper2");
}
}
// Set default image name if neither /out nor /def set it.
if (config->outputFile.empty()) {
config->outputFile =
getOutputPath((*args.filtered(OPT_INPUT).begin())->getValue());
}
// Fail early if an output file is not writable.
if (auto e = tryCreateFile(config->outputFile)) {
error("cannot open output file " + config->outputFile + ": " + e.message());
return;
}
if (shouldCreatePDB) {
// Put the PDB next to the image if no /pdb flag was passed.
if (config->pdbPath.empty()) {
config->pdbPath = config->outputFile;
sys::path::replace_extension(config->pdbPath, ".pdb");
}
// The embedded PDB path should be the absolute path to the PDB if no
// /pdbaltpath flag was passed.
if (config->pdbAltPath.empty()) {
config->pdbAltPath = config->pdbPath;
// It's important to make the path absolute and remove dots. This path
// will eventually be written into the PE header, and certain Microsoft
// tools won't work correctly if these assumptions are not held.
sys::fs::make_absolute(config->pdbAltPath);
sys::path::remove_dots(config->pdbAltPath);
} else {
// Don't do this earlier, so that config->outputFile is ready.
parsePDBAltPath(config->pdbAltPath);
}
}
// Set default image base if /base is not given.
if (config->imageBase == uint64_t(-1))
config->imageBase = getDefaultImageBase();
symtab->addSynthetic(mangle("__ImageBase"), nullptr);
if (config->machine == I386) {
symtab->addAbsolute("___safe_se_handler_table", 0);
symtab->addAbsolute("___safe_se_handler_count", 0);
}
symtab->addAbsolute(mangle("__guard_fids_count"), 0);
symtab->addAbsolute(mangle("__guard_fids_table"), 0);
symtab->addAbsolute(mangle("__guard_flags"), 0);
symtab->addAbsolute(mangle("__guard_iat_count"), 0);
symtab->addAbsolute(mangle("__guard_iat_table"), 0);
symtab->addAbsolute(mangle("__guard_longjmp_count"), 0);
symtab->addAbsolute(mangle("__guard_longjmp_table"), 0);
// Needed for MSVC 2017 15.5 CRT.
symtab->addAbsolute(mangle("__enclave_config"), 0);
if (config->mingw) {
symtab->addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST__"), 0);
symtab->addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST_END__"), 0);
symtab->addAbsolute(mangle("__CTOR_LIST__"), 0);
symtab->addAbsolute(mangle("__DTOR_LIST__"), 0);
}
// This code may add new undefined symbols to the link, which may enqueue more
// symbol resolution tasks, so we need to continue executing tasks until we
// converge.
do {
// Windows specific -- if entry point is not found,
// search for its mangled names.
if (config->entry)
mangleMaybe(config->entry);
// Windows specific -- Make sure we resolve all dllexported symbols.
for (Export &e : config->exports) {
if (!e.forwardTo.empty())
continue;
e.sym = addUndefined(e.name);
if (!e.directives)
e.symbolName = mangleMaybe(e.sym);
}
// Add weak aliases. Weak aliases are a mechanism to give remaining
// undefined symbols a final chance to be resolved successfully.
for (auto pair : config->alternateNames) {
StringRef from = pair.first;
StringRef to = pair.second;
Symbol *sym = symtab->find(from);
if (!sym)
continue;
if (auto *u = dyn_cast<Undefined>(sym))
if (!u->weakAlias)
u->weakAlias = symtab->addUndefined(to);
}
+
+ // If any inputs are bitcode files, the LTO code generator may create
+ // references to library functions that are not explicit in the bitcode
+ // file's symbol table. If any of those library functions are defined in a
+ // bitcode file in an archive member, we need to arrange to use LTO to
+ // compile those archive members by adding them to the link beforehand.
+ if (!BitcodeFile::instances.empty())
+ for (const char *s : libcallRoutineNames)
+ symtab->addLibcall(s);
// Windows specific -- if __load_config_used can be resolved, resolve it.
if (symtab->findUnderscore("_load_config_used"))
addUndefined(mangle("_load_config_used"));
} while (run());
if (errorCount())
return;
// Do LTO by compiling bitcode input files to a set of native COFF files then
// link those files (unless -thinlto-index-only was given, in which case we
// resolve symbols and write indices, but don't generate native code or link).
symtab->addCombinedLTOObjects();
// If -thinlto-index-only is given, we should create only "index
// files" and not object files. Index file creation is already done
// in addCombinedLTOObject, so we are done if that's the case.
if (config->thinLTOIndexOnly)
return;
// If we generated native object files from bitcode files, this resolves
// references to the symbols we use from them.
run();
if (args.hasArg(OPT_include_optional)) {
// Handle /includeoptional
for (auto *arg : args.filtered(OPT_include_optional))
if (dyn_cast_or_null<Lazy>(symtab->find(arg->getValue())))
addUndefined(arg->getValue());
while (run());
}
if (config->mingw) {
// Load any further object files that might be needed for doing automatic
// imports.
//
// For cases with no automatically imported symbols, this iterates once
// over the symbol table and doesn't do anything.
//
// For the normal case with a few automatically imported symbols, this
// should only need to be run once, since each new object file imported
// is an import library and wouldn't add any new undefined references,
// but there's nothing stopping the __imp_ symbols from coming from a
// normal object file as well (although that won't be used for the
// actual autoimport later on). If this pass adds new undefined references,
// we won't iterate further to resolve them.
symtab->loadMinGWAutomaticImports();
run();
}
// Make sure we have resolved all symbols.
symtab->reportRemainingUndefines();
if (errorCount())
return;
if (config->mingw) {
// In MinGW, all symbols are automatically exported if no symbols
// are chosen to be exported.
maybeExportMinGWSymbols(args);
// Make sure the crtend.o object is the last object file. This object
// file can contain terminating section chunks that need to be placed
// last. GNU ld processes files and static libraries explicitly in the
// order provided on the command line, while lld will pull in needed
// files from static libraries only after the last object file on the
// command line.
for (auto i = ObjFile::instances.begin(), e = ObjFile::instances.end();
i != e; i++) {
ObjFile *file = *i;
if (isCrtend(file->getName())) {
ObjFile::instances.erase(i);
ObjFile::instances.push_back(file);
break;
}
}
}
// Windows specific -- when we are creating a .dll file, we also
// need to create a .lib file.
if (!config->exports.empty() || config->dll) {
fixupExports();
createImportLibrary(/*asLib=*/false);
assignExportOrdinals();
}
// Handle /output-def (MinGW specific).
if (auto *arg = args.getLastArg(OPT_output_def))
writeDefFile(arg->getValue());
// Set extra alignment for .comm symbols
for (auto pair : config->alignComm) {
StringRef name = pair.first;
uint32_t alignment = pair.second;
Symbol *sym = symtab->find(name);
if (!sym) {
warn("/aligncomm symbol " + name + " not found");
continue;
}
// If the symbol isn't common, it must have been replaced with a regular
// symbol, which will carry its own alignment.
auto *dc = dyn_cast<DefinedCommon>(sym);
if (!dc)
continue;
CommonChunk *c = dc->getChunk();
c->setAlignment(std::max(c->getAlignment(), alignment));
}
// Windows specific -- Create a side-by-side manifest file.
if (config->manifest == Configuration::SideBySide)
createSideBySideManifest();
// Handle /order. We want to do this at this moment because we
// need a complete list of comdat sections to warn on nonexistent
// functions.
if (auto *arg = args.getLastArg(OPT_order))
parseOrderFile(arg->getValue());
// Identify unreferenced COMDAT sections.
if (config->doGC)
markLive(symtab->getChunks());
// Needs to happen after the last call to addFile().
diagnoseMultipleResourceObjFiles();
// Identify identical COMDAT sections to merge them.
if (config->doICF) {
findKeepUniqueSections();
doICF(symtab->getChunks());
}
// Write the result.
writeResult();
// Stop early so we can print the results.
Timer::root().stop();
if (config->showTiming)
Timer::root().print();
}
} // namespace coff
} // namespace lld
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/Driver.h
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/Driver.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/Driver.h (revision 351722)
@@ -1,202 +1,202 @@
//===- Driver.h -------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLD_COFF_DRIVER_H
#define LLD_COFF_DRIVER_H
#include "Config.h"
#include "SymbolTable.h"
#include "lld/Common/LLVM.h"
#include "lld/Common/Reproduce.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/COFF.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/TarWriter.h"
#include <memory>
#include <set>
#include <vector>
namespace lld {
namespace coff {
class LinkerDriver;
extern LinkerDriver *driver;
using llvm::COFF::MachineTypes;
using llvm::COFF::WindowsSubsystem;
using llvm::Optional;
class COFFOptTable : public llvm::opt::OptTable {
public:
COFFOptTable();
};
class ArgParser {
public:
// Concatenate LINK environment variable and given arguments and parse them.
llvm::opt::InputArgList parseLINK(std::vector<const char *> args);
// Tokenizes a given string and then parses as command line options.
llvm::opt::InputArgList parse(StringRef s) { return parse(tokenize(s)); }
// Tokenizes a given string and then parses as command line options in
// .drectve section. /EXPORT options are returned in second element
// to be processed in fastpath.
std::pair<llvm::opt::InputArgList, std::vector<StringRef>>
parseDirectives(StringRef s);
private:
// Parses command line options.
llvm::opt::InputArgList parse(llvm::ArrayRef<const char *> args);
std::vector<const char *> tokenize(StringRef s);
COFFOptTable table;
};
class LinkerDriver {
public:
void link(llvm::ArrayRef<const char *> args);
// Used by the resolver to parse .drectve section contents.
void parseDirectives(InputFile *file);
// Used by ArchiveFile to enqueue members.
- void enqueueArchiveMember(const Archive::Child &c, StringRef symName,
+ void enqueueArchiveMember(const Archive::Child &c, const Archive::Symbol &sym,
StringRef parentName);
MemoryBufferRef takeBuffer(std::unique_ptr<MemoryBuffer> mb);
void enqueuePath(StringRef path, bool wholeArchive);
private:
std::unique_ptr<llvm::TarWriter> tar; // for /linkrepro
// Opens a file. Path has to be resolved already.
MemoryBufferRef openFile(StringRef path);
// Searches a file from search paths.
Optional<StringRef> findFile(StringRef filename);
Optional<StringRef> findLib(StringRef filename);
StringRef doFindFile(StringRef filename);
StringRef doFindLib(StringRef filename);
StringRef doFindLibMinGW(StringRef filename);
// Parses the LIB environment variable, which contains a list of search paths.
void addLibSearchPaths();
// Library search path. The first element is always "" (current directory).
std::vector<StringRef> searchPaths;
void maybeExportMinGWSymbols(const llvm::opt::InputArgList &args);
// We don't want to add the same file more than once.
// Files are uniquified by their filesystem and file number.
std::set<llvm::sys::fs::UniqueID> visitedFiles;
std::set<std::string> visitedLibs;
Symbol *addUndefined(StringRef sym);
StringRef mangleMaybe(Symbol *s);
// Windows specific -- "main" is not the only main function in Windows.
// You can choose one from these four -- {w,}{WinMain,main}.
// There are four different entry point functions for them,
// {w,}{WinMain,main}CRTStartup, respectively. The linker needs to
// choose the right one depending on which "main" function is defined.
// This function looks up the symbol table and resolves the corresponding
// entry point name.
StringRef findDefaultEntry();
WindowsSubsystem inferSubsystem();
void addBuffer(std::unique_ptr<MemoryBuffer> mb, bool wholeArchive);
void addArchiveBuffer(MemoryBufferRef mbref, StringRef symName,
StringRef parentName, uint64_t offsetInArchive);
void enqueueTask(std::function<void()> task);
bool run();
std::list<std::function<void()>> taskQueue;
std::vector<StringRef> filePaths;
std::vector<MemoryBufferRef> resources;
llvm::StringSet<> directivesExports;
};
// Functions below this line are defined in DriverUtils.cpp.
void printHelp(const char *argv0);
// Parses a string in the form of "<integer>[,<integer>]".
void parseNumbers(StringRef arg, uint64_t *addr, uint64_t *size = nullptr);
void parseGuard(StringRef arg);
// Parses a string in the form of "<integer>[.<integer>]".
// Minor's default value is 0.
void parseVersion(StringRef arg, uint32_t *major, uint32_t *minor);
// Parses a string in the form of "<subsystem>[,<integer>[.<integer>]]".
void parseSubsystem(StringRef arg, WindowsSubsystem *sys, uint32_t *major,
uint32_t *minor);
void parseAlternateName(StringRef);
void parseMerge(StringRef);
void parseSection(StringRef);
void parseAligncomm(StringRef);
// Parses a string in the form of "[:<integer>]"
void parseFunctionPadMin(llvm::opt::Arg *a, llvm::COFF::MachineTypes machine);
// Parses a string in the form of "EMBED[,=<integer>]|NO".
void parseManifest(StringRef arg);
// Parses a string in the form of "level=<string>|uiAccess=<string>"
void parseManifestUAC(StringRef arg);
// Parses a string in the form of "cd|net[,(cd|net)]*"
void parseSwaprun(StringRef arg);
// Create a resource file containing a manifest XML.
std::unique_ptr<MemoryBuffer> createManifestRes();
void createSideBySideManifest();
// Used for dllexported symbols.
Export parseExport(StringRef arg);
void fixupExports();
void assignExportOrdinals();
// Parses a string in the form of "key=value" and check
// if value matches previous values for the key.
// This feature used in the directive section to reject
// incompatible objects.
void checkFailIfMismatch(StringRef arg, InputFile *source);
// Convert Windows resource files (.res files) to a .obj file.
MemoryBufferRef convertResToCOFF(ArrayRef<MemoryBufferRef> mbs);
void runMSVCLinker(std::string rsp, ArrayRef<StringRef> objects);
// Create enum with OPT_xxx values for each option in Options.td
enum {
OPT_INVALID = 0,
#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID,
#include "Options.inc"
#undef OPTION
};
} // namespace coff
} // namespace lld
#endif
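The version-style parsers declared above (parseVersion, parseSubsystem) all follow the "<integer>[.<integer>]" convention, with the minor number defaulting to 0 when the dot is absent. The standalone sketch below illustrates only that convention; it is not lld's implementation, and parseSimpleVersion is a hypothetical helper that uses the standard library instead of llvm::StringRef.

#include <cstdint>
#include <cstdio>
#include <string>

// Illustrative only: parses "<major>[.<minor>]"; minor defaults to 0.
static bool parseSimpleVersion(const std::string &arg, uint32_t *major,
                               uint32_t *minor) {
  size_t dot = arg.find('.');
  try {
    *major = static_cast<uint32_t>(std::stoul(arg.substr(0, dot)));
    *minor = dot == std::string::npos
                 ? 0
                 : static_cast<uint32_t>(std::stoul(arg.substr(dot + 1)));
  } catch (...) {
    return false; // not a pair of integers
  }
  return true;
}

int main() {
  uint32_t major = 0, minor = 0;
  if (parseSimpleVersion("6.02", &major, &minor))
    std::printf("version %u.%u\n", major, minor); // prints "version 6.2"
}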
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/InputFiles.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/InputFiles.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/InputFiles.cpp (revision 351722)
@@ -1,881 +1,881 @@
//===- InputFiles.cpp -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "InputFiles.h"
#include "Chunks.h"
#include "Config.h"
#include "DebugTypes.h"
#include "Driver.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "llvm-c/lto.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Target/TargetOptions.h"
#include <cstring>
#include <system_error>
#include <utility>
using namespace llvm;
using namespace llvm::COFF;
using namespace llvm::codeview;
using namespace llvm::object;
using namespace llvm::support::endian;
using llvm::Triple;
using llvm::support::ulittle32_t;
namespace lld {
namespace coff {
std::vector<ObjFile *> ObjFile::instances;
std::vector<ImportFile *> ImportFile::instances;
std::vector<BitcodeFile *> BitcodeFile::instances;
/// Checks that Source is compatible with being a weak alias to Target.
/// If Source is Undefined and has no weak alias set, makes it a weak
/// alias to Target.
static void checkAndSetWeakAlias(SymbolTable *symtab, InputFile *f,
Symbol *source, Symbol *target) {
if (auto *u = dyn_cast<Undefined>(source)) {
if (u->weakAlias && u->weakAlias != target) {
// Weak aliases as produced by GCC are named in the form
// .weak.<weaksymbol>.<othersymbol>, where <othersymbol> is the name
// of another symbol emitted near the weak symbol.
// Just use the definition from the first object file that defined
// this weak symbol.
if (config->mingw)
return;
symtab->reportDuplicate(source, f);
}
u->weakAlias = target;
}
}
ArchiveFile::ArchiveFile(MemoryBufferRef m) : InputFile(ArchiveKind, m) {}
void ArchiveFile::parse() {
// Parse a MemoryBufferRef as an archive file.
file = CHECK(Archive::create(mb), this);
// Read the symbol table to construct Lazy objects.
for (const Archive::Symbol &sym : file->symbols())
symtab->addLazy(this, sym);
}
// Returns a buffer pointing to a member file containing a given symbol.
-void ArchiveFile::addMember(const Archive::Symbol *sym) {
+void ArchiveFile::addMember(const Archive::Symbol &sym) {
const Archive::Child &c =
- CHECK(sym->getMember(),
- "could not get the member for symbol " + sym->getName());
+ CHECK(sym.getMember(),
+ "could not get the member for symbol " + toCOFFString(sym));
// Return an empty buffer if we have already returned the same buffer.
if (!seen.insert(c.getChildOffset()).second)
return;
- driver->enqueueArchiveMember(c, sym->getName(), getName());
+ driver->enqueueArchiveMember(c, sym, getName());
}
std::vector<MemoryBufferRef> getArchiveMembers(Archive *file) {
std::vector<MemoryBufferRef> v;
Error err = Error::success();
for (const ErrorOr<Archive::Child> &cOrErr : file->children(err)) {
Archive::Child c =
CHECK(cOrErr,
file->getFileName() + ": could not get the child of the archive");
MemoryBufferRef mbref =
CHECK(c.getMemoryBufferRef(),
file->getFileName() +
": could not get the buffer for a child of the archive");
v.push_back(mbref);
}
if (err)
fatal(file->getFileName() +
": Archive::children failed: " + toString(std::move(err)));
return v;
}
void ObjFile::parse() {
// Parse a memory buffer as a COFF file.
std::unique_ptr<Binary> bin = CHECK(createBinary(mb), this);
if (auto *obj = dyn_cast<COFFObjectFile>(bin.get())) {
bin.release();
coffObj.reset(obj);
} else {
fatal(toString(this) + " is not a COFF file");
}
// Read section and symbol tables.
initializeChunks();
initializeSymbols();
initializeFlags();
initializeDependencies();
}
const coff_section* ObjFile::getSection(uint32_t i) {
const coff_section *sec;
if (auto ec = coffObj->getSection(i, sec))
fatal("getSection failed: #" + Twine(i) + ": " + ec.message());
return sec;
}
// We set SectionChunk pointers in the SparseChunks vector to this value
// temporarily to mark comdat sections as having an unknown resolution. As we
// walk the object file's symbol table, once we visit either a leader symbol or
// an associative section definition together with the parent comdat's leader,
// we set the pointer to either nullptr (to mark the section as discarded) or a
// valid SectionChunk for that section.
static SectionChunk *const pendingComdat = reinterpret_cast<SectionChunk *>(1);
void ObjFile::initializeChunks() {
uint32_t numSections = coffObj->getNumberOfSections();
chunks.reserve(numSections);
sparseChunks.resize(numSections + 1);
for (uint32_t i = 1; i < numSections + 1; ++i) {
const coff_section *sec = getSection(i);
if (sec->Characteristics & IMAGE_SCN_LNK_COMDAT)
sparseChunks[i] = pendingComdat;
else
sparseChunks[i] = readSection(i, nullptr, "");
}
}
SectionChunk *ObjFile::readSection(uint32_t sectionNumber,
const coff_aux_section_definition *def,
StringRef leaderName) {
const coff_section *sec = getSection(sectionNumber);
StringRef name;
if (Expected<StringRef> e = coffObj->getSectionName(sec))
name = *e;
else
fatal("getSectionName failed: #" + Twine(sectionNumber) + ": " +
toString(e.takeError()));
if (name == ".drectve") {
ArrayRef<uint8_t> data;
cantFail(coffObj->getSectionContents(sec, data));
directives = StringRef((const char *)data.data(), data.size());
return nullptr;
}
if (name == ".llvm_addrsig") {
addrsigSec = sec;
return nullptr;
}
// Object files may have DWARF debug info or MS CodeView debug info
// (or both).
//
// DWARF sections don't need any special handling from the perspective
// of the linker; they are just a data section containing relocations.
// We can just link them to complete debug info.
//
// CodeView needs linker support. We need to interpret debug info,
// and then write it to a separate .pdb file.
// Ignore DWARF debug info unless /debug is given.
if (!config->debug && name.startswith(".debug_"))
return nullptr;
if (sec->Characteristics & llvm::COFF::IMAGE_SCN_LNK_REMOVE)
return nullptr;
auto *c = make<SectionChunk>(this, sec);
if (def)
c->checksum = def->CheckSum;
// link.exe uses the presence of .rsrc$01 for LNK4078, so match that.
if (name == ".rsrc$01")
isResourceObjFile = true;
// CodeView sections are stored to a different vector because they are not
// linked in the regular manner.
if (c->isCodeView())
debugChunks.push_back(c);
else if (name == ".gfids$y")
guardFidChunks.push_back(c);
else if (name == ".gljmp$y")
guardLJmpChunks.push_back(c);
else if (name == ".sxdata")
sXDataChunks.push_back(c);
else if (config->tailMerge && sec->NumberOfRelocations == 0 &&
name == ".rdata" && leaderName.startswith("??_C@"))
// COFF sections that look like string literal sections (i.e. no
// relocations, in .rdata, leader symbol name matches the MSVC name mangling
// for string literals) are subject to string tail merging.
MergeChunk::addSection(c);
else
chunks.push_back(c);
return c;
}
void ObjFile::readAssociativeDefinition(
COFFSymbolRef sym, const coff_aux_section_definition *def) {
readAssociativeDefinition(sym, def, def->getNumber(sym.isBigObj()));
}
void ObjFile::readAssociativeDefinition(COFFSymbolRef sym,
const coff_aux_section_definition *def,
uint32_t parentIndex) {
SectionChunk *parent = sparseChunks[parentIndex];
int32_t sectionNumber = sym.getSectionNumber();
auto diag = [&]() {
StringRef name, parentName;
coffObj->getSymbolName(sym, name);
const coff_section *parentSec = getSection(parentIndex);
if (Expected<StringRef> e = coffObj->getSectionName(parentSec))
parentName = *e;
error(toString(this) + ": associative comdat " + name + " (sec " +
Twine(sectionNumber) + ") has invalid reference to section " +
parentName + " (sec " + Twine(parentIndex) + ")");
};
if (parent == pendingComdat) {
// This can happen if an associative comdat refers to another associative
// comdat that appears after it (invalid per COFF spec) or to a section
// without any symbols.
diag();
return;
}
// Check whether the parent is prevailing. If it is, so are we, and we read
// the section; otherwise mark it as discarded.
if (parent) {
SectionChunk *c = readSection(sectionNumber, def, "");
sparseChunks[sectionNumber] = c;
if (c) {
c->selection = IMAGE_COMDAT_SELECT_ASSOCIATIVE;
parent->addAssociative(c);
}
} else {
sparseChunks[sectionNumber] = nullptr;
}
}
void ObjFile::recordPrevailingSymbolForMingw(
COFFSymbolRef sym, DenseMap<StringRef, uint32_t> &prevailingSectionMap) {
// For comdat symbols in executable sections, where this is the copy
// of the section chunk we actually include instead of discarding it,
// add the symbol to a map to allow using it for implicitly
// associating .[px]data$<func> sections to it.
int32_t sectionNumber = sym.getSectionNumber();
SectionChunk *sc = sparseChunks[sectionNumber];
if (sc && sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE) {
StringRef name;
coffObj->getSymbolName(sym, name);
if (getMachineType() == I386)
name.consume_front("_");
prevailingSectionMap[name] = sectionNumber;
}
}
void ObjFile::maybeAssociateSEHForMingw(
COFFSymbolRef sym, const coff_aux_section_definition *def,
const DenseMap<StringRef, uint32_t> &prevailingSectionMap) {
StringRef name;
coffObj->getSymbolName(sym, name);
if (name.consume_front(".pdata$") || name.consume_front(".xdata$") ||
name.consume_front(".eh_frame$")) {
// For MinGW, treat .[px]data$<func> and .eh_frame$<func> as implicitly
// associative to the symbol <func>.
auto parentSym = prevailingSectionMap.find(name);
if (parentSym != prevailingSectionMap.end())
readAssociativeDefinition(sym, def, parentSym->second);
}
}
Symbol *ObjFile::createRegular(COFFSymbolRef sym) {
SectionChunk *sc = sparseChunks[sym.getSectionNumber()];
if (sym.isExternal()) {
StringRef name;
coffObj->getSymbolName(sym, name);
if (sc)
return symtab->addRegular(this, name, sym.getGeneric(), sc);
// For MinGW symbols named .weak.* that point to a discarded section,
// don't create an Undefined symbol. If nothing ever refers to the symbol,
// everything should be fine. If something actually refers to the symbol
// (e.g. the undefined weak alias), linking will fail due to undefined
// references at the end.
if (config->mingw && name.startswith(".weak."))
return nullptr;
return symtab->addUndefined(name, this, false);
}
if (sc)
return make<DefinedRegular>(this, /*Name*/ "", /*IsCOMDAT*/ false,
/*IsExternal*/ false, sym.getGeneric(), sc);
return nullptr;
}
void ObjFile::initializeSymbols() {
uint32_t numSymbols = coffObj->getNumberOfSymbols();
symbols.resize(numSymbols);
SmallVector<std::pair<Symbol *, uint32_t>, 8> weakAliases;
std::vector<uint32_t> pendingIndexes;
pendingIndexes.reserve(numSymbols);
DenseMap<StringRef, uint32_t> prevailingSectionMap;
std::vector<const coff_aux_section_definition *> comdatDefs(
coffObj->getNumberOfSections() + 1);
for (uint32_t i = 0; i < numSymbols; ++i) {
COFFSymbolRef coffSym = check(coffObj->getSymbol(i));
bool prevailingComdat;
if (coffSym.isUndefined()) {
symbols[i] = createUndefined(coffSym);
} else if (coffSym.isWeakExternal()) {
symbols[i] = createUndefined(coffSym);
uint32_t tagIndex = coffSym.getAux<coff_aux_weak_external>()->TagIndex;
weakAliases.emplace_back(symbols[i], tagIndex);
} else if (Optional<Symbol *> optSym =
createDefined(coffSym, comdatDefs, prevailingComdat)) {
symbols[i] = *optSym;
if (config->mingw && prevailingComdat)
recordPrevailingSymbolForMingw(coffSym, prevailingSectionMap);
} else {
// createDefined() returns None if a symbol belongs to a section that
// was pending at the point when the symbol was read. This can happen in
// two cases:
// 1) section definition symbol for a comdat leader;
// 2) symbol belongs to a comdat section associated with another section.
// In both of these cases, we can expect the section to be resolved by
// the time we finish visiting the remaining symbols in the symbol
// table. So we postpone the handling of this symbol until that time.
pendingIndexes.push_back(i);
}
i += coffSym.getNumberOfAuxSymbols();
}
for (uint32_t i : pendingIndexes) {
COFFSymbolRef sym = check(coffObj->getSymbol(i));
if (const coff_aux_section_definition *def = sym.getSectionDefinition()) {
if (def->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE)
readAssociativeDefinition(sym, def);
else if (config->mingw)
maybeAssociateSEHForMingw(sym, def, prevailingSectionMap);
}
if (sparseChunks[sym.getSectionNumber()] == pendingComdat) {
StringRef name;
coffObj->getSymbolName(sym, name);
log("comdat section " + name +
" without leader and unassociated, discarding");
continue;
}
symbols[i] = createRegular(sym);
}
for (auto &kv : weakAliases) {
Symbol *sym = kv.first;
uint32_t idx = kv.second;
checkAndSetWeakAlias(symtab, this, sym, symbols[idx]);
}
}
Symbol *ObjFile::createUndefined(COFFSymbolRef sym) {
StringRef name;
coffObj->getSymbolName(sym, name);
return symtab->addUndefined(name, this, sym.isWeakExternal());
}
void ObjFile::handleComdatSelection(COFFSymbolRef sym, COMDATType &selection,
bool &prevailing, DefinedRegular *leader) {
if (prevailing)
return;
// There's already an existing comdat for this symbol: `Leader`.
// Use the comdats's selection field to determine if the new
// symbol in `Sym` should be discarded, produce a duplicate symbol
// error, etc.
SectionChunk *leaderChunk = nullptr;
COMDATType leaderSelection = IMAGE_COMDAT_SELECT_ANY;
if (leader->data) {
leaderChunk = leader->getChunk();
leaderSelection = leaderChunk->selection;
} else {
// FIXME: comdats from LTO files don't know their selection; treat them
// as "any".
selection = leaderSelection;
}
if ((selection == IMAGE_COMDAT_SELECT_ANY &&
leaderSelection == IMAGE_COMDAT_SELECT_LARGEST) ||
(selection == IMAGE_COMDAT_SELECT_LARGEST &&
leaderSelection == IMAGE_COMDAT_SELECT_ANY)) {
// cl.exe picks "any" for vftables when building with /GR- and
// "largest" when building with /GR. To be able to link object files
// compiled with each flag, "any" and "largest" are merged as "largest".
leaderSelection = selection = IMAGE_COMDAT_SELECT_LARGEST;
}
// Other than that, comdat selections must match. This is a bit more
// strict than link.exe which allows merging "any" and "largest" if "any"
// is the first symbol the linker sees, and it allows merging "largest"
// with everything (!) if "largest" is the first symbol the linker sees.
// Making this symmetric independent of which selection is seen first
// seems better though.
// (This behavior matches ModuleLinker::getComdatResult().)
if (selection != leaderSelection) {
log(("conflicting comdat type for " + toString(*leader) + ": " +
Twine((int)leaderSelection) + " in " + toString(leader->getFile()) +
" and " + Twine((int)selection) + " in " + toString(this))
.str());
symtab->reportDuplicate(leader, this);
return;
}
switch (selection) {
case IMAGE_COMDAT_SELECT_NODUPLICATES:
symtab->reportDuplicate(leader, this);
break;
case IMAGE_COMDAT_SELECT_ANY:
// Nothing to do.
break;
case IMAGE_COMDAT_SELECT_SAME_SIZE:
if (leaderChunk->getSize() != getSection(sym)->SizeOfRawData)
symtab->reportDuplicate(leader, this);
break;
case IMAGE_COMDAT_SELECT_EXACT_MATCH: {
SectionChunk newChunk(this, getSection(sym));
// link.exe only compares section contents here and doesn't complain
// if the two comdat sections have e.g. different alignment.
// Match that.
if (leaderChunk->getContents() != newChunk.getContents())
symtab->reportDuplicate(leader, this);
break;
}
case IMAGE_COMDAT_SELECT_ASSOCIATIVE:
// createDefined() is never called for IMAGE_COMDAT_SELECT_ASSOCIATIVE.
// (This means lld-link doesn't produce duplicate symbol errors for
// associative comdats while link.exe does, but associative comdats
// are never extern in practice.)
llvm_unreachable("createDefined not called for associative comdats");
case IMAGE_COMDAT_SELECT_LARGEST:
if (leaderChunk->getSize() < getSection(sym)->SizeOfRawData) {
// Replace the existing comdat symbol with the new one.
StringRef name;
coffObj->getSymbolName(sym, name);
// FIXME: This is incorrect: With /opt:noref, the previous sections
// make it into the final executable as well. Correct handling would
// be to undo reading of the whole old section that's being replaced,
// or doing one pass that determines what the final largest comdat
// is for all IMAGE_COMDAT_SELECT_LARGEST comdats and then reading
// only the largest one.
replaceSymbol<DefinedRegular>(leader, this, name, /*IsCOMDAT*/ true,
/*IsExternal*/ true, sym.getGeneric(),
nullptr);
prevailing = true;
}
break;
case IMAGE_COMDAT_SELECT_NEWEST:
llvm_unreachable("should have been rejected earlier");
}
}
Optional<Symbol *> ObjFile::createDefined(
COFFSymbolRef sym,
std::vector<const coff_aux_section_definition *> &comdatDefs,
bool &prevailing) {
prevailing = false;
auto getName = [&]() {
StringRef s;
coffObj->getSymbolName(sym, s);
return s;
};
if (sym.isCommon()) {
auto *c = make<CommonChunk>(sym);
chunks.push_back(c);
return symtab->addCommon(this, getName(), sym.getValue(), sym.getGeneric(),
c);
}
if (sym.isAbsolute()) {
StringRef name = getName();
// Skip special symbols.
if (name == "@comp.id")
return nullptr;
if (name == "@feat.00") {
feat00Flags = sym.getValue();
return nullptr;
}
if (sym.isExternal())
return symtab->addAbsolute(name, sym);
return make<DefinedAbsolute>(name, sym);
}
int32_t sectionNumber = sym.getSectionNumber();
if (sectionNumber == llvm::COFF::IMAGE_SYM_DEBUG)
return nullptr;
if (llvm::COFF::isReservedSectionNumber(sectionNumber))
fatal(toString(this) + ": " + getName() +
" should not refer to special section " + Twine(sectionNumber));
if ((uint32_t)sectionNumber >= sparseChunks.size())
fatal(toString(this) + ": " + getName() +
" should not refer to non-existent section " + Twine(sectionNumber));
// Comdat handling.
// A comdat symbol consists of two symbol table entries.
// The first symbol entry has the name of the section (e.g. .text), fixed
// values for the other fields, and one auxiliary record.
// The second symbol entry has the name of the comdat symbol, called the
// "comdat leader".
// When this function is called for the first symbol entry of a comdat,
// it sets comdatDefs and returns None, and when it's called for the second
// symbol entry it reads comdatDefs and then sets it back to nullptr.
// Handle comdat leader.
if (const coff_aux_section_definition *def = comdatDefs[sectionNumber]) {
comdatDefs[sectionNumber] = nullptr;
DefinedRegular *leader;
if (sym.isExternal()) {
std::tie(leader, prevailing) =
symtab->addComdat(this, getName(), sym.getGeneric());
} else {
leader = make<DefinedRegular>(this, /*Name*/ "", /*IsCOMDAT*/ false,
/*IsExternal*/ false, sym.getGeneric());
prevailing = true;
}
if (def->Selection < (int)IMAGE_COMDAT_SELECT_NODUPLICATES ||
// Intentionally ends at IMAGE_COMDAT_SELECT_LARGEST: link.exe
// doesn't understand IMAGE_COMDAT_SELECT_NEWEST either.
def->Selection > (int)IMAGE_COMDAT_SELECT_LARGEST) {
fatal("unknown comdat type " + std::to_string((int)def->Selection) +
" for " + getName() + " in " + toString(this));
}
COMDATType selection = (COMDATType)def->Selection;
if (leader->isCOMDAT)
handleComdatSelection(sym, selection, prevailing, leader);
if (prevailing) {
SectionChunk *c = readSection(sectionNumber, def, getName());
sparseChunks[sectionNumber] = c;
c->sym = cast<DefinedRegular>(leader);
c->selection = selection;
cast<DefinedRegular>(leader)->data = &c->repl;
} else {
sparseChunks[sectionNumber] = nullptr;
}
return leader;
}
// Prepare to handle the comdat leader symbol by setting the section's
// ComdatDefs pointer if we encounter a non-associative comdat.
if (sparseChunks[sectionNumber] == pendingComdat) {
if (const coff_aux_section_definition *def = sym.getSectionDefinition()) {
if (def->Selection != IMAGE_COMDAT_SELECT_ASSOCIATIVE)
comdatDefs[sectionNumber] = def;
}
return None;
}
return createRegular(sym);
}
MachineTypes ObjFile::getMachineType() {
if (coffObj)
return static_cast<MachineTypes>(coffObj->getMachine());
return IMAGE_FILE_MACHINE_UNKNOWN;
}
ArrayRef<uint8_t> ObjFile::getDebugSection(StringRef secName) {
if (SectionChunk *sec = SectionChunk::findByName(debugChunks, secName))
return sec->consumeDebugMagic();
return {};
}
// OBJ files systematically store critical information in a .debug$S stream,
// even if the TU was compiled with no debug info. At least two records are
// always there. S_OBJNAME stores a 32-bit signature, which is loaded into the
// PCHSignature member. S_COMPILE3 stores compile-time cmd-line flags. This is
// currently used to initialize the hotPatchable member.
void ObjFile::initializeFlags() {
ArrayRef<uint8_t> data = getDebugSection(".debug$S");
if (data.empty())
return;
DebugSubsectionArray subsections;
BinaryStreamReader reader(data, support::little);
ExitOnError exitOnErr;
exitOnErr(reader.readArray(subsections, data.size()));
for (const DebugSubsectionRecord &ss : subsections) {
if (ss.kind() != DebugSubsectionKind::Symbols)
continue;
unsigned offset = 0;
// Only parse the first two records. We are only looking for S_OBJNAME
// and S_COMPILE3, and they usually appear at the beginning of the
// stream.
for (unsigned i = 0; i < 2; ++i) {
Expected<CVSymbol> sym = readSymbolFromStream(ss.getRecordData(), offset);
if (!sym) {
consumeError(sym.takeError());
return;
}
if (sym->kind() == SymbolKind::S_COMPILE3) {
auto cs =
cantFail(SymbolDeserializer::deserializeAs<Compile3Sym>(sym.get()));
hotPatchable =
(cs.Flags & CompileSym3Flags::HotPatch) != CompileSym3Flags::None;
}
if (sym->kind() == SymbolKind::S_OBJNAME) {
auto objName = cantFail(SymbolDeserializer::deserializeAs<ObjNameSym>(
sym.get()));
pchSignature = objName.Signature;
}
offset += sym->length();
}
}
}
// Depending on the compilation flags, OBJs can refer to external files that
// are necessary to merge this OBJ into the final PDB. We currently support two
// types of external files: Precomp/PCH OBJs (when compiling with /Yc and /Yu)
// and PDB type servers (when compiling with /Zi). This function extracts these
// dependencies and makes them available as a TpiSource interface (see
// DebugTypes.h). Both cases only happen with cl.exe: clang-cl produces regular
// output even with /Yc and /Yu and with /Zi.
void ObjFile::initializeDependencies() {
if (!config->debug)
return;
bool isPCH = false;
ArrayRef<uint8_t> data = getDebugSection(".debug$P");
if (!data.empty())
isPCH = true;
else
data = getDebugSection(".debug$T");
if (data.empty())
return;
CVTypeArray types;
BinaryStreamReader reader(data, support::little);
cantFail(reader.readArray(types, reader.getLength()));
CVTypeArray::Iterator firstType = types.begin();
if (firstType == types.end())
return;
debugTypes.emplace(types);
if (isPCH) {
debugTypesObj = makePrecompSource(this);
return;
}
if (firstType->kind() == LF_TYPESERVER2) {
TypeServer2Record ts = cantFail(
TypeDeserializer::deserializeAs<TypeServer2Record>(firstType->data()));
debugTypesObj = makeUseTypeServerSource(this, &ts);
return;
}
if (firstType->kind() == LF_PRECOMP) {
PrecompRecord precomp = cantFail(
TypeDeserializer::deserializeAs<PrecompRecord>(firstType->data()));
debugTypesObj = makeUsePrecompSource(this, &precomp);
return;
}
debugTypesObj = makeTpiSource(this);
}
StringRef ltrim1(StringRef s, const char *chars) {
if (!s.empty() && strchr(chars, s[0]))
return s.substr(1);
return s;
}
void ImportFile::parse() {
const char *buf = mb.getBufferStart();
const auto *hdr = reinterpret_cast<const coff_import_header *>(buf);
// Check if the total size is valid.
if (mb.getBufferSize() != sizeof(*hdr) + hdr->SizeOfData)
fatal("broken import library");
// Read names and create an __imp_ symbol.
StringRef name = saver.save(StringRef(buf + sizeof(*hdr)));
StringRef impName = saver.save("__imp_" + name);
const char *nameStart = buf + sizeof(coff_import_header) + name.size() + 1;
dllName = StringRef(nameStart);
StringRef extName;
switch (hdr->getNameType()) {
case IMPORT_ORDINAL:
extName = "";
break;
case IMPORT_NAME:
extName = name;
break;
case IMPORT_NAME_NOPREFIX:
extName = ltrim1(name, "?@_");
break;
case IMPORT_NAME_UNDECORATE:
extName = ltrim1(name, "?@_");
extName = extName.substr(0, extName.find('@'));
break;
}
this->hdr = hdr;
externalName = extName;
impSym = symtab->addImportData(impName, this);
// If this was a duplicate, we logged an error but may continue;
// in this case, impSym is nullptr.
if (!impSym)
return;
if (hdr->getType() == llvm::COFF::IMPORT_CONST)
static_cast<void>(symtab->addImportData(name, this));
// If the type is function, we need to create a thunk which jumps to the
// address pointed to by the __imp_ symbol. (This allows you to call
// DLL functions just like regular non-DLL functions.)
if (hdr->getType() == llvm::COFF::IMPORT_CODE)
thunkSym = symtab->addImportThunk(
name, cast_or_null<DefinedImportData>(impSym), hdr->Machine);
}
BitcodeFile::BitcodeFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive)
: InputFile(BitcodeKind, mb) {
std::string path = mb.getBufferIdentifier().str();
if (config->thinLTOIndexOnly)
path = replaceThinLTOSuffix(mb.getBufferIdentifier());
// ThinLTO assumes that all MemoryBufferRefs given to it have a unique
// name. If two archives define two members with the same name, this
// causes a collision which results in only one of the objects being taken
// into consideration at LTO time (which very likely causes undefined
// symbols later in the link stage). So we append the file offset to make
// the filename unique.
MemoryBufferRef mbref(
mb.getBuffer(),
saver.save(archiveName + path +
(archiveName.empty() ? "" : utostr(offsetInArchive))));
obj = check(lto::InputFile::create(mbref));
}
void BitcodeFile::parse() {
std::vector<std::pair<Symbol *, bool>> comdat(obj->getComdatTable().size());
for (size_t i = 0; i != obj->getComdatTable().size(); ++i)
// FIXME: lto::InputFile doesn't keep enough data to do correct comdat
// selection handling.
comdat[i] = symtab->addComdat(this, saver.save(obj->getComdatTable()[i]));
for (const lto::InputFile::Symbol &objSym : obj->symbols()) {
StringRef symName = saver.save(objSym.getName());
int comdatIndex = objSym.getComdatIndex();
Symbol *sym;
if (objSym.isUndefined()) {
sym = symtab->addUndefined(symName, this, false);
} else if (objSym.isCommon()) {
sym = symtab->addCommon(this, symName, objSym.getCommonSize());
} else if (objSym.isWeak() && objSym.isIndirect()) {
// Weak external.
sym = symtab->addUndefined(symName, this, true);
std::string fallback = objSym.getCOFFWeakExternalFallback();
Symbol *alias = symtab->addUndefined(saver.save(fallback));
checkAndSetWeakAlias(symtab, this, sym, alias);
} else if (comdatIndex != -1) {
if (symName == obj->getComdatTable()[comdatIndex])
sym = comdat[comdatIndex].first;
else if (comdat[comdatIndex].second)
sym = symtab->addRegular(this, symName);
else
sym = symtab->addUndefined(symName, this, false);
} else {
sym = symtab->addRegular(this, symName);
}
symbols.push_back(sym);
if (objSym.isUsed())
config->gcroot.push_back(sym);
}
directives = obj->getCOFFLinkerOpts();
}
MachineTypes BitcodeFile::getMachineType() {
switch (Triple(obj->getTargetTriple()).getArch()) {
case Triple::x86_64:
return AMD64;
case Triple::x86:
return I386;
case Triple::arm:
return ARMNT;
case Triple::aarch64:
return ARM64;
default:
return IMAGE_FILE_MACHINE_UNKNOWN;
}
}
std::string replaceThinLTOSuffix(StringRef path) {
StringRef suffix = config->thinLTOObjectSuffixReplace.first;
StringRef repl = config->thinLTOObjectSuffixReplace.second;
if (path.consume_back(suffix))
return (path + repl).str();
return path;
}
} // namespace coff
} // namespace lld
// Returns the last element of a path, which is supposed to be a filename.
static StringRef getBasename(StringRef path) {
return sys::path::filename(path, sys::path::Style::windows);
}
// Returns a string in the format of "foo.obj" or "foo.obj(bar.lib)".
std::string lld::toString(const coff::InputFile *file) {
if (!file)
return "<internal>";
if (file->parentName.empty() || file->kind() == coff::InputFile::ImportKind)
return file->getName();
return (getBasename(file->parentName) + "(" + getBasename(file->getName()) +
")")
.str();
}
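ObjFile::handleComdatSelection() above reconciles the comdat selection of an incoming symbol with the selection recorded for the existing leader: "any" and "largest" merge to "largest", and any other mismatch is reported as a duplicate. The sketch below restates just that merging rule with hypothetical stand-in enumerators; it is not the lld code path, which additionally compares sizes and contents for the SAME_SIZE and EXACT_MATCH selections.

#include <cstdio>

// Stand-ins for the llvm::COFF::COMDATType values referenced above.
enum Selection { SelAny, SelLargest, SelExactMatch };

// Returns true if the two selections are compatible and stores the merged
// selection in *out; mirrors the "any" + "largest" => "largest" rule.
static bool reconcile(Selection leader, Selection incoming, Selection *out) {
  if ((leader == SelAny && incoming == SelLargest) ||
      (leader == SelLargest && incoming == SelAny)) {
    *out = SelLargest; // cl.exe /GR vs. /GR- vftable case
    return true;
  }
  if (leader == incoming) {
    *out = leader;
    return true;
  }
  return false; // conflicting comdat type: treat as a duplicate symbol
}

int main() {
  Selection merged;
  std::printf("%d\n", reconcile(SelAny, SelLargest, &merged));    // 1
  std::printf("%d\n", reconcile(SelExactMatch, SelAny, &merged)); // 0
}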
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/InputFiles.h
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/InputFiles.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/InputFiles.h (revision 351722)
@@ -1,321 +1,321 @@
//===- InputFiles.h ---------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLD_COFF_INPUT_FILES_H
#define LLD_COFF_INPUT_FILES_H
#include "Config.h"
#include "lld/Common/LLVM.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/StringSaver.h"
#include <memory>
#include <set>
#include <vector>
namespace llvm {
namespace pdb {
class DbiModuleDescriptorBuilder;
}
}
namespace lld {
namespace coff {
std::vector<MemoryBufferRef> getArchiveMembers(llvm::object::Archive *file);
using llvm::COFF::IMAGE_FILE_MACHINE_UNKNOWN;
using llvm::COFF::MachineTypes;
using llvm::object::Archive;
using llvm::object::COFFObjectFile;
using llvm::object::COFFSymbolRef;
using llvm::object::coff_import_header;
using llvm::object::coff_section;
class Chunk;
class Defined;
class DefinedImportData;
class DefinedImportThunk;
class DefinedRegular;
class Lazy;
class SectionChunk;
class Symbol;
class Undefined;
class TpiSource;
// The root class of input files.
class InputFile {
public:
enum Kind { ArchiveKind, ObjectKind, ImportKind, BitcodeKind };
Kind kind() const { return fileKind; }
virtual ~InputFile() {}
// Returns the filename.
StringRef getName() const { return mb.getBufferIdentifier(); }
// Reads a file (the constructor doesn't do that).
virtual void parse() = 0;
// Returns the CPU type this file was compiled to.
virtual MachineTypes getMachineType() { return IMAGE_FILE_MACHINE_UNKNOWN; }
MemoryBufferRef mb;
// An archive file name if this file is created from an archive.
StringRef parentName;
// Returns .drectve section contents if they exist.
StringRef getDirectives() { return directives; }
protected:
InputFile(Kind k, MemoryBufferRef m) : mb(m), fileKind(k) {}
StringRef directives;
private:
const Kind fileKind;
};
// .lib or .a file.
class ArchiveFile : public InputFile {
public:
explicit ArchiveFile(MemoryBufferRef m);
static bool classof(const InputFile *f) { return f->kind() == ArchiveKind; }
void parse() override;
// Enqueues an archive member load for the given symbol. If we've already
// enqueued a load for the same archive member, this function does nothing,
// which ensures that we don't load the same member more than once.
- void addMember(const Archive::Symbol *sym);
+ void addMember(const Archive::Symbol &sym);
private:
std::unique_ptr<Archive> file;
llvm::DenseSet<uint64_t> seen;
};
// .obj or .o file. This may be a member of an archive file.
class ObjFile : public InputFile {
public:
explicit ObjFile(MemoryBufferRef m) : InputFile(ObjectKind, m) {}
static bool classof(const InputFile *f) { return f->kind() == ObjectKind; }
void parse() override;
MachineTypes getMachineType() override;
ArrayRef<Chunk *> getChunks() { return chunks; }
ArrayRef<SectionChunk *> getDebugChunks() { return debugChunks; }
ArrayRef<SectionChunk *> getSXDataChunks() { return sXDataChunks; }
ArrayRef<SectionChunk *> getGuardFidChunks() { return guardFidChunks; }
ArrayRef<SectionChunk *> getGuardLJmpChunks() { return guardLJmpChunks; }
ArrayRef<Symbol *> getSymbols() { return symbols; }
ArrayRef<uint8_t> getDebugSection(StringRef secName);
// Returns a Symbol object for the symbolIndex'th symbol in the
// underlying object file.
Symbol *getSymbol(uint32_t symbolIndex) {
return symbols[symbolIndex];
}
// Returns the underlying COFF file.
COFFObjectFile *getCOFFObj() { return coffObj.get(); }
// Add a symbol for a range extension thunk. Return the new symbol table
// index. This index can be used to modify a relocation.
uint32_t addRangeThunkSymbol(Symbol *thunk) {
symbols.push_back(thunk);
return symbols.size() - 1;
}
static std::vector<ObjFile *> instances;
// Flags in the absolute @feat.00 symbol if it is present. These usually
// indicate if an object was compiled with certain security features enabled
// like stack guard, safeseh, /guard:cf, or other things.
uint32_t feat00Flags = 0;
// True if this object file is compatible with SEH. COFF-specific and
// x86-only. COFF spec 5.10.1. The .sxdata section.
bool hasSafeSEH() { return feat00Flags & 0x1; }
// True if this file was compiled with /guard:cf.
bool hasGuardCF() { return feat00Flags & 0x800; }
// Pointer to the PDB module descriptor builder. Various debug info records
// will reference object files by "module index", which is here. Things like
// source files and section contributions are also recorded here. Will be null
// if we are not producing a PDB.
llvm::pdb::DbiModuleDescriptorBuilder *moduleDBI = nullptr;
const coff_section *addrsigSec = nullptr;
// When using Microsoft precompiled headers, this is the PCH's key.
// The same key is used by both the precompiled object, and objects using the
// precompiled object. Any difference indicates out-of-date objects.
llvm::Optional<uint32_t> pchSignature;
// Whether this is an object file created from .res files.
bool isResourceObjFile = false;
// Whether this file was compiled with /hotpatch.
bool hotPatchable = false;
// Whether the object was already merged into the final PDB.
bool mergedIntoPDB = false;
// If the OBJ has a .debug$T stream, this tells how it will be handled.
TpiSource *debugTypesObj = nullptr;
// The .debug$T stream if there's one.
llvm::Optional<llvm::codeview::CVTypeArray> debugTypes;
private:
const coff_section* getSection(uint32_t i);
const coff_section *getSection(COFFSymbolRef sym) {
return getSection(sym.getSectionNumber());
}
void initializeChunks();
void initializeSymbols();
void initializeFlags();
void initializeDependencies();
SectionChunk *
readSection(uint32_t sectionNumber,
const llvm::object::coff_aux_section_definition *def,
StringRef leaderName);
void readAssociativeDefinition(
COFFSymbolRef coffSym,
const llvm::object::coff_aux_section_definition *def);
void readAssociativeDefinition(
COFFSymbolRef coffSym,
const llvm::object::coff_aux_section_definition *def,
uint32_t parentSection);
void recordPrevailingSymbolForMingw(
COFFSymbolRef coffSym,
llvm::DenseMap<StringRef, uint32_t> &prevailingSectionMap);
void maybeAssociateSEHForMingw(
COFFSymbolRef sym, const llvm::object::coff_aux_section_definition *def,
const llvm::DenseMap<StringRef, uint32_t> &prevailingSectionMap);
// Given a new symbol Sym with comdat selection Selection, if the new
// symbol is not (yet) Prevailing and the existing comdat leader set to
// Leader, emits a diagnostic if the new symbol and its selection doesn't
// match the existing symbol and its selection. If either old or new
// symbol have selection IMAGE_COMDAT_SELECT_LARGEST, Sym might replace
// the existing leader. In that case, Prevailing is set to true.
void handleComdatSelection(COFFSymbolRef sym,
llvm::COFF::COMDATType &selection,
bool &prevailing, DefinedRegular *leader);
llvm::Optional<Symbol *>
createDefined(COFFSymbolRef sym,
std::vector<const llvm::object::coff_aux_section_definition *>
&comdatDefs,
bool &prevailingComdat);
Symbol *createRegular(COFFSymbolRef sym);
Symbol *createUndefined(COFFSymbolRef sym);
std::unique_ptr<COFFObjectFile> coffObj;
// List of all chunks defined by this file. This includes both section
// chunks and non-section chunks for common symbols.
std::vector<Chunk *> chunks;
// CodeView debug info sections.
std::vector<SectionChunk *> debugChunks;
// Chunks containing symbol table indices of exception handlers. Only used for
// 32-bit x86.
std::vector<SectionChunk *> sXDataChunks;
// Chunks containing symbol table indices of address taken symbols and longjmp
// targets. These are not linked into the final binary when /guard:cf is set.
std::vector<SectionChunk *> guardFidChunks;
std::vector<SectionChunk *> guardLJmpChunks;
// This vector contains the same chunks as Chunks, but they are
// indexed such that you can get a SectionChunk by section index.
// Nonexistent section indices are filled with null pointers.
// (Because section number is 1-based, the first slot is always a
// null pointer.)
std::vector<SectionChunk *> sparseChunks;
// This vector contains a list of all symbols defined or referenced by this
// file. They are indexed such that you can get a Symbol by symbol
// index. Nonexistent indices (which are occupied by auxiliary
// symbols in the real symbol table) are filled with null pointers.
std::vector<Symbol *> symbols;
};
// This type represents import library members that contain DLL names
// and symbols exported from the DLLs. See Microsoft PE/COFF spec. 7
// for details about the format.
class ImportFile : public InputFile {
public:
explicit ImportFile(MemoryBufferRef m) : InputFile(ImportKind, m) {}
static bool classof(const InputFile *f) { return f->kind() == ImportKind; }
static std::vector<ImportFile *> instances;
Symbol *impSym = nullptr;
Symbol *thunkSym = nullptr;
std::string dllName;
private:
void parse() override;
public:
StringRef externalName;
const coff_import_header *hdr;
Chunk *location = nullptr;
// We want to eliminate dllimported symbols if no one actually refers to them.
// These "Live" bits are used to keep track of which import library members
// are actually in use.
//
// If the Live bit is turned off by MarkLive, Writer will ignore dllimported
// symbols provided by this import library member. We also track whether the
// imported symbol is used separately from whether the thunk is used in order
// to avoid creating unnecessary thunks.
bool live = !config->doGC;
bool thunkLive = !config->doGC;
};
// Used for LTO.
class BitcodeFile : public InputFile {
public:
BitcodeFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive);
static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; }
ArrayRef<Symbol *> getSymbols() { return symbols; }
MachineTypes getMachineType() override;
static std::vector<BitcodeFile *> instances;
std::unique_ptr<llvm::lto::InputFile> obj;
private:
void parse() override;
std::vector<Symbol *> symbols;
};
std::string replaceThinLTOSuffix(StringRef path);
} // namespace coff
std::string toString(const coff::InputFile *file);
} // namespace lld
#endif
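The feat00Flags member above caches the value of the absolute @feat.00 symbol, and hasSafeSEH()/hasGuardCF() test its 0x1 and 0x800 bits respectively. Below is a minimal standalone restatement of those two bit tests, using a hypothetical Feat00 struct in place of ObjFile.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for ObjFile; the masks come from hasSafeSEH() and
// hasGuardCF() in InputFiles.h above.
struct Feat00 {
  uint32_t flags = 0;
  bool hasSafeSEH() const { return flags & 0x1; }   // object is SEH-compatible (COFF spec 5.10.1)
  bool hasGuardCF() const { return flags & 0x800; } // compiled with /guard:cf
};

int main() {
  Feat00 f;
  f.flags = 0x801; // both bits set
  std::printf("safeseh=%d guardcf=%d\n", f.hasSafeSEH(), f.hasGuardCF());
}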
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/SymbolTable.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/SymbolTable.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/SymbolTable.cpp (revision 351722)
@@ -1,603 +1,615 @@
//===- SymbolTable.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "SymbolTable.h"
#include "Config.h"
#include "Driver.h"
#include "LTO.h"
#include "PDB.h"
#include "Symbols.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Timer.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <utility>
using namespace llvm;
namespace lld {
namespace coff {
static Timer ltoTimer("LTO", Timer::root());
SymbolTable *symtab;
void SymbolTable::addFile(InputFile *file) {
log("Reading " + toString(file));
file->parse();
MachineTypes mt = file->getMachineType();
if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN) {
config->machine = mt;
} else if (mt != IMAGE_FILE_MACHINE_UNKNOWN && config->machine != mt) {
error(toString(file) + ": machine type " + machineToStr(mt) +
" conflicts with " + machineToStr(config->machine));
return;
}
if (auto *f = dyn_cast<ObjFile>(file)) {
ObjFile::instances.push_back(f);
} else if (auto *f = dyn_cast<BitcodeFile>(file)) {
BitcodeFile::instances.push_back(f);
} else if (auto *f = dyn_cast<ImportFile>(file)) {
ImportFile::instances.push_back(f);
}
driver->parseDirectives(file);
}
static void errorOrWarn(const Twine &s) {
if (config->forceUnresolved)
warn(s);
else
error(s);
}
// Returns the symbol in SC whose value is <= Addr that is closest to Addr.
// This is generally the global variable or function whose definition contains
// Addr.
static Symbol *getSymbol(SectionChunk *sc, uint32_t addr) {
DefinedRegular *candidate = nullptr;
for (Symbol *s : sc->file->getSymbols()) {
auto *d = dyn_cast_or_null<DefinedRegular>(s);
if (!d || !d->data || d->getChunk() != sc || d->getValue() > addr ||
(candidate && d->getValue() < candidate->getValue()))
continue;
candidate = d;
}
return candidate;
}
// Given a file and the index of a symbol in that file, returns a description
// of all references to that symbol from that file. If no debug information is
// available, returns just the name of the file, else one string per actual
// reference as described in the debug info.
std::vector<std::string> getSymbolLocations(ObjFile *file, uint32_t symIndex) {
struct Location {
Symbol *sym;
std::pair<StringRef, uint32_t> fileLine;
};
std::vector<Location> locations;
for (Chunk *c : file->getChunks()) {
auto *sc = dyn_cast<SectionChunk>(c);
if (!sc)
continue;
for (const coff_relocation &r : sc->getRelocs()) {
if (r.SymbolTableIndex != symIndex)
continue;
std::pair<StringRef, uint32_t> fileLine =
getFileLine(sc, r.VirtualAddress);
Symbol *sym = getSymbol(sc, r.VirtualAddress);
if (!fileLine.first.empty() || sym)
locations.push_back({sym, fileLine});
}
}
if (locations.empty())
return std::vector<std::string>({"\n>>> referenced by " + toString(file)});
std::vector<std::string> symbolLocations(locations.size());
size_t i = 0;
for (Location loc : locations) {
llvm::raw_string_ostream os(symbolLocations[i++]);
os << "\n>>> referenced by ";
if (!loc.fileLine.first.empty())
os << loc.fileLine.first << ":" << loc.fileLine.second
<< "\n>>> ";
os << toString(file);
if (loc.sym)
os << ":(" << toString(*loc.sym) << ')';
}
return symbolLocations;
}
// For an undefined symbol, stores all files referencing it and the index of
// the undefined symbol in each file.
struct UndefinedDiag {
Symbol *sym;
struct File {
ObjFile *oFile;
uint64_t symIndex;
};
std::vector<File> files;
};
static void reportUndefinedSymbol(const UndefinedDiag &undefDiag) {
std::string out;
llvm::raw_string_ostream os(out);
os << "undefined symbol: " << toString(*undefDiag.sym);
const size_t maxUndefReferences = 10;
size_t i = 0, numRefs = 0;
for (const UndefinedDiag::File &ref : undefDiag.files) {
std::vector<std::string> symbolLocations =
getSymbolLocations(ref.oFile, ref.symIndex);
numRefs += symbolLocations.size();
for (const std::string &s : symbolLocations) {
if (i >= maxUndefReferences)
break;
os << s;
i++;
}
}
if (i < numRefs)
os << "\n>>> referenced " << numRefs - i << " more times";
errorOrWarn(os.str());
}
void SymbolTable::loadMinGWAutomaticImports() {
for (auto &i : symMap) {
Symbol *sym = i.second;
auto *undef = dyn_cast<Undefined>(sym);
if (!undef)
continue;
if (!sym->isUsedInRegularObj)
continue;
StringRef name = undef->getName();
if (name.startswith("__imp_"))
continue;
// If we have an undefined symbol, but we have a Lazy representing a
// symbol we could load from file, make sure to load that.
Lazy *l = dyn_cast_or_null<Lazy>(find(("__imp_" + name).str()));
if (!l || l->pendingArchiveLoad)
continue;
log("Loading lazy " + l->getName() + " from " + l->file->getName() +
" for automatic import");
l->pendingArchiveLoad = true;
- l->file->addMember(&l->sym);
+ l->file->addMember(l->sym);
}
}
bool SymbolTable::handleMinGWAutomaticImport(Symbol *sym, StringRef name) {
if (name.startswith("__imp_"))
return false;
Defined *imp = dyn_cast_or_null<Defined>(find(("__imp_" + name).str()));
if (!imp)
return false;
// Replace the reference directly to a variable with a reference
// to the import address table instead. This obviously isn't right,
// but we mark the symbol as isRuntimePseudoReloc, and a later pass
// will add runtime pseudo relocations for every relocation against
// this Symbol. The runtime pseudo relocation framework expects the
// reference itself to point at the IAT entry.
size_t impSize = 0;
if (isa<DefinedImportData>(imp)) {
log("Automatically importing " + name + " from " +
cast<DefinedImportData>(imp)->getDLLName());
impSize = sizeof(DefinedImportData);
} else if (isa<DefinedRegular>(imp)) {
log("Automatically importing " + name + " from " +
toString(cast<DefinedRegular>(imp)->file));
impSize = sizeof(DefinedRegular);
} else {
warn("unable to automatically import " + name + " from " + imp->getName() +
" from " + toString(cast<DefinedRegular>(imp)->file) +
"; unexpected symbol type");
return false;
}
sym->replaceKeepingName(imp, impSize);
sym->isRuntimePseudoReloc = true;
// There may exist symbols named .refptr.<name> which only consist
// of a single pointer to <name>. If it turns out <name> is
// automatically imported, we don't need to keep the .refptr.<name>
// pointer at all, but redirect all accesses to it to the IAT entry
// for __imp_<name> instead, and drop the whole .refptr.<name> chunk.
DefinedRegular *refptr =
dyn_cast_or_null<DefinedRegular>(find((".refptr." + name).str()));
if (refptr && refptr->getChunk()->getSize() == config->wordsize) {
SectionChunk *sc = dyn_cast_or_null<SectionChunk>(refptr->getChunk());
if (sc && sc->getRelocs().size() == 1 && *sc->symbols().begin() == sym) {
log("Replacing .refptr." + name + " with " + imp->getName());
refptr->getChunk()->live = false;
refptr->replaceKeepingName(imp, impSize);
}
}
return true;
}
void SymbolTable::reportRemainingUndefines() {
SmallPtrSet<Symbol *, 8> undefs;
DenseMap<Symbol *, Symbol *> localImports;
for (auto &i : symMap) {
Symbol *sym = i.second;
auto *undef = dyn_cast<Undefined>(sym);
if (!undef)
continue;
if (!sym->isUsedInRegularObj)
continue;
StringRef name = undef->getName();
// A weak alias may have been resolved, so check for that.
if (Defined *d = undef->getWeakAlias()) {
// We want to replace Sym with D. However, we can't just blindly
// copy sizeof(SymbolUnion) bytes from D to Sym because D may be an
// internal symbol, and internal symbols are stored as "unparented"
// Symbols. For that reason we need to check which type of symbol we
// are dealing with and copy the correct number of bytes.
if (isa<DefinedRegular>(d))
memcpy(sym, d, sizeof(DefinedRegular));
else if (isa<DefinedAbsolute>(d))
memcpy(sym, d, sizeof(DefinedAbsolute));
else
memcpy(sym, d, sizeof(SymbolUnion));
continue;
}
// If we can resolve a symbol by removing the __imp_ prefix, do that.
// This odd rule is for compatibility with the MSVC linker.
if (name.startswith("__imp_")) {
Symbol *imp = find(name.substr(strlen("__imp_")));
if (imp && isa<Defined>(imp)) {
auto *d = cast<Defined>(imp);
replaceSymbol<DefinedLocalImport>(sym, name, d);
localImportChunks.push_back(cast<DefinedLocalImport>(sym)->getChunk());
localImports[sym] = d;
continue;
}
}
// We don't want to report missing Microsoft precompiled header symbols.
// A proper message will be emitted instead in PDBLinker::aquirePrecompObj
if (name.contains("_PchSym_"))
continue;
if (config->mingw && handleMinGWAutomaticImport(sym, name))
continue;
// Remaining undefined symbols are not fatal if /force is specified.
// They are replaced with dummy defined symbols.
if (config->forceUnresolved)
replaceSymbol<DefinedAbsolute>(sym, name, 0);
undefs.insert(sym);
}
if (undefs.empty() && localImports.empty())
return;
for (Symbol *b : config->gcroot) {
if (undefs.count(b))
errorOrWarn("<root>: undefined symbol: " + toString(*b));
if (config->warnLocallyDefinedImported)
if (Symbol *imp = localImports.lookup(b))
warn("<root>: locally defined symbol imported: " + toString(*imp) +
" (defined in " + toString(imp->getFile()) + ") [LNK4217]");
}
std::vector<UndefinedDiag> undefDiags;
DenseMap<Symbol *, int> firstDiag;
for (ObjFile *file : ObjFile::instances) {
size_t symIndex = (size_t)-1;
for (Symbol *sym : file->getSymbols()) {
++symIndex;
if (!sym)
continue;
if (undefs.count(sym)) {
auto it = firstDiag.find(sym);
if (it == firstDiag.end()) {
firstDiag[sym] = undefDiags.size();
undefDiags.push_back({sym, {{file, symIndex}}});
} else {
undefDiags[it->second].files.push_back({file, symIndex});
}
}
if (config->warnLocallyDefinedImported)
if (Symbol *imp = localImports.lookup(sym))
warn(toString(file) +
": locally defined symbol imported: " + toString(*imp) +
" (defined in " + toString(imp->getFile()) + ") [LNK4217]");
}
}
for (const UndefinedDiag& undefDiag : undefDiags)
reportUndefinedSymbol(undefDiag);
}
std::pair<Symbol *, bool> SymbolTable::insert(StringRef name) {
bool inserted = false;
Symbol *&sym = symMap[CachedHashStringRef(name)];
if (!sym) {
sym = reinterpret_cast<Symbol *>(make<SymbolUnion>());
sym->isUsedInRegularObj = false;
sym->pendingArchiveLoad = false;
inserted = true;
}
return {sym, inserted};
}
std::pair<Symbol *, bool> SymbolTable::insert(StringRef name, InputFile *file) {
std::pair<Symbol *, bool> result = insert(name);
if (!file || !isa<BitcodeFile>(file))
result.first->isUsedInRegularObj = true;
return result;
}
Symbol *SymbolTable::addUndefined(StringRef name, InputFile *f,
bool isWeakAlias) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(name, f);
if (wasInserted || (isa<Lazy>(s) && isWeakAlias)) {
replaceSymbol<Undefined>(s, name);
return s;
}
if (auto *l = dyn_cast<Lazy>(s)) {
if (!s->pendingArchiveLoad) {
s->pendingArchiveLoad = true;
- l->file->addMember(&l->sym);
+ l->file->addMember(l->sym);
}
}
return s;
}
-void SymbolTable::addLazy(ArchiveFile *f, const Archive::Symbol sym) {
+void SymbolTable::addLazy(ArchiveFile *f, const Archive::Symbol &sym) {
StringRef name = sym.getName();
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(name);
if (wasInserted) {
replaceSymbol<Lazy>(s, f, sym);
return;
}
auto *u = dyn_cast<Undefined>(s);
if (!u || u->weakAlias || s->pendingArchiveLoad)
return;
s->pendingArchiveLoad = true;
- f->addMember(&sym);
+ f->addMember(sym);
}
void SymbolTable::reportDuplicate(Symbol *existing, InputFile *newFile) {
std::string msg = "duplicate symbol: " + toString(*existing) + " in " +
toString(existing->getFile()) + " and in " +
toString(newFile);
if (config->forceMultiple)
warn(msg);
else
error(msg);
}
Symbol *SymbolTable::addAbsolute(StringRef n, COFFSymbolRef sym) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(n, nullptr);
s->isUsedInRegularObj = true;
if (wasInserted || isa<Undefined>(s) || isa<Lazy>(s))
replaceSymbol<DefinedAbsolute>(s, n, sym);
else if (!isa<DefinedCOFF>(s))
reportDuplicate(s, nullptr);
return s;
}
Symbol *SymbolTable::addAbsolute(StringRef n, uint64_t va) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(n, nullptr);
s->isUsedInRegularObj = true;
if (wasInserted || isa<Undefined>(s) || isa<Lazy>(s))
replaceSymbol<DefinedAbsolute>(s, n, va);
else if (!isa<DefinedCOFF>(s))
reportDuplicate(s, nullptr);
return s;
}
Symbol *SymbolTable::addSynthetic(StringRef n, Chunk *c) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(n, nullptr);
s->isUsedInRegularObj = true;
if (wasInserted || isa<Undefined>(s) || isa<Lazy>(s))
replaceSymbol<DefinedSynthetic>(s, n, c);
else if (!isa<DefinedCOFF>(s))
reportDuplicate(s, nullptr);
return s;
}
Symbol *SymbolTable::addRegular(InputFile *f, StringRef n,
const coff_symbol_generic *sym,
SectionChunk *c) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(n, f);
if (wasInserted || !isa<DefinedRegular>(s))
replaceSymbol<DefinedRegular>(s, f, n, /*IsCOMDAT*/ false,
/*IsExternal*/ true, sym, c);
else
reportDuplicate(s, f);
return s;
}
std::pair<DefinedRegular *, bool>
SymbolTable::addComdat(InputFile *f, StringRef n,
const coff_symbol_generic *sym) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(n, f);
if (wasInserted || !isa<DefinedRegular>(s)) {
replaceSymbol<DefinedRegular>(s, f, n, /*IsCOMDAT*/ true,
/*IsExternal*/ true, sym, nullptr);
return {cast<DefinedRegular>(s), true};
}
auto *existingSymbol = cast<DefinedRegular>(s);
if (!existingSymbol->isCOMDAT)
reportDuplicate(s, f);
return {existingSymbol, false};
}
Symbol *SymbolTable::addCommon(InputFile *f, StringRef n, uint64_t size,
const coff_symbol_generic *sym, CommonChunk *c) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(n, f);
if (wasInserted || !isa<DefinedCOFF>(s))
replaceSymbol<DefinedCommon>(s, f, n, size, sym, c);
else if (auto *dc = dyn_cast<DefinedCommon>(s))
if (size > dc->getSize())
replaceSymbol<DefinedCommon>(s, f, n, size, sym, c);
return s;
}
Symbol *SymbolTable::addImportData(StringRef n, ImportFile *f) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(n, nullptr);
s->isUsedInRegularObj = true;
if (wasInserted || isa<Undefined>(s) || isa<Lazy>(s)) {
replaceSymbol<DefinedImportData>(s, n, f);
return s;
}
reportDuplicate(s, f);
return nullptr;
}
Symbol *SymbolTable::addImportThunk(StringRef name, DefinedImportData *id,
uint16_t machine) {
Symbol *s;
bool wasInserted;
std::tie(s, wasInserted) = insert(name, nullptr);
s->isUsedInRegularObj = true;
if (wasInserted || isa<Undefined>(s) || isa<Lazy>(s)) {
replaceSymbol<DefinedImportThunk>(s, name, id, machine);
return s;
}
reportDuplicate(s, id->file);
return nullptr;
+}
+
+void SymbolTable::addLibcall(StringRef name) {
+ Symbol *sym = findUnderscore(name);
+ if (!sym)
+ return;
+
+ if (Lazy *l = dyn_cast<Lazy>(sym)) {
+ MemoryBufferRef mb = l->getMemberBuffer();
+ if (identify_magic(mb.getBuffer()) == llvm::file_magic::bitcode)
+ addUndefined(sym->getName());
+ }
}
std::vector<Chunk *> SymbolTable::getChunks() {
std::vector<Chunk *> res;
for (ObjFile *file : ObjFile::instances) {
ArrayRef<Chunk *> v = file->getChunks();
res.insert(res.end(), v.begin(), v.end());
}
return res;
}
Symbol *SymbolTable::find(StringRef name) {
return symMap.lookup(CachedHashStringRef(name));
}
Symbol *SymbolTable::findUnderscore(StringRef name) {
if (config->machine == I386)
return find(("_" + name).str());
return find(name);
}
// Return all symbols that start with Prefix, possibly ignoring the first
// character of Prefix or the first character of the symbol name.
std::vector<Symbol *> SymbolTable::getSymsWithPrefix(StringRef prefix) {
std::vector<Symbol *> syms;
for (auto pair : symMap) {
StringRef name = pair.first.val();
if (name.startswith(prefix) || name.startswith(prefix.drop_front()) ||
name.drop_front().startswith(prefix) ||
name.drop_front().startswith(prefix.drop_front())) {
syms.push_back(pair.second);
}
}
return syms;
}
Symbol *SymbolTable::findMangle(StringRef name) {
if (Symbol *sym = find(name))
if (!isa<Undefined>(sym))
return sym;
// Efficient fuzzy string lookup is impossible with a hash table, so iterate
// the symbol table once and collect all possibly matching symbols into this
// vector. Then compare each possibly matching symbol with each possible
// mangling.
std::vector<Symbol *> syms = getSymsWithPrefix(name);
auto findByPrefix = [&syms](const Twine &t) -> Symbol * {
std::string prefix = t.str();
for (auto *s : syms)
if (s->getName().startswith(prefix))
return s;
return nullptr;
};
// For non-x86, just look for C++ functions.
if (config->machine != I386)
return findByPrefix("?" + name + "@@Y");
if (!name.startswith("_"))
return nullptr;
// Search for x86 stdcall function.
if (Symbol *s = findByPrefix(name + "@"))
return s;
// Search for x86 fastcall function.
if (Symbol *s = findByPrefix("@" + name.substr(1) + "@"))
return s;
// Search for x86 vectorcall function.
if (Symbol *s = findByPrefix(name.substr(1) + "@@"))
return s;
// Search for x86 C++ non-member function.
return findByPrefix("?" + name.substr(1) + "@@Y");
}
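// --- Editor's illustrative sketch (not part of LLD or of this change) -------
// A self-contained rendering of the probe order used by findMangle above for
// i386: given an undecorated "_foo", try stdcall "_foo@N", fastcall "@foo@N",
// vectorcall "foo@@N", and finally the C++ non-member mangling "?foo@@Y...".
// findDecorated and its inputs are hypothetical stand-ins, not lld API; the
// snippet compiles as a standalone translation unit.
#include <optional>
#include <string>
#include <vector>
static std::optional<std::string>
findDecorated(const std::string &name, const std::vector<std::string> &syms) {
  auto startsWith = [](const std::string &s, const std::string &p) {
    return s.compare(0, p.size(), p) == 0;
  };
  auto findByPrefix =
      [&](const std::string &prefix) -> std::optional<std::string> {
    for (const std::string &s : syms)
      if (startsWith(s, prefix))
        return s;
    return std::nullopt;
  };
  if (name.empty() || name[0] != '_')
    return std::nullopt;                        // x86 C names carry a leading '_'
  std::string base = name.substr(1);            // drop the underscore
  if (auto s = findByPrefix(name + "@"))        // stdcall:    _foo@12
    return s;
  if (auto s = findByPrefix("@" + base + "@"))  // fastcall:   @foo@12
    return s;
  if (auto s = findByPrefix(base + "@@"))       // vectorcall: foo@@12
    return s;
  return findByPrefix("?" + base + "@@Y");      // C++ non-member: ?foo@@Y...
}
// -----------------------------------------------------------------------------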
Symbol *SymbolTable::addUndefined(StringRef name) {
return addUndefined(name, nullptr, false);
}
std::vector<StringRef> SymbolTable::compileBitcodeFiles() {
lto.reset(new BitcodeCompiler);
for (BitcodeFile *f : BitcodeFile::instances)
lto->add(*f);
return lto->compile();
}
void SymbolTable::addCombinedLTOObjects() {
if (BitcodeFile::instances.empty())
return;
ScopedTimer t(ltoTimer);
for (StringRef object : compileBitcodeFiles()) {
auto *obj = make<ObjFile>(MemoryBufferRef(object, "lto.tmp"));
obj->parse();
ObjFile::instances.push_back(obj);
}
}
} // namespace coff
} // namespace lld
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/SymbolTable.h
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/SymbolTable.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/SymbolTable.h (revision 351722)
@@ -1,131 +1,132 @@
//===- SymbolTable.h --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLD_COFF_SYMBOL_TABLE_H
#define LLD_COFF_SYMBOL_TABLE_H
#include "InputFiles.h"
#include "LTO.h"
#include "llvm/ADT/CachedHashString.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
struct LTOCodeGenerator;
}
namespace lld {
namespace coff {
class Chunk;
class CommonChunk;
class Defined;
class DefinedAbsolute;
class DefinedRegular;
class DefinedRelative;
class Lazy;
class SectionChunk;
class Symbol;
// SymbolTable is a bucket of all known symbols, including defined,
// undefined, or lazy symbols (the last category is symbols in archive
// files whose archive members are not yet loaded).
//
// We put all symbols of all files to a SymbolTable, and the
// SymbolTable selects the "best" symbols if there are name
// conflicts. For example, obviously, a defined symbol is better than
// an undefined symbol. Or, if there's a conflict between a lazy and an
// undefined symbol, it'll load an archive member to get a real definition
// to replace the lazy symbol. The logic is implemented in the
// add*() functions, which are called by input files as they are parsed.
// There is one add* function per symbol type.
class SymbolTable {
public:
void addFile(InputFile *file);
// Try to resolve any undefined symbols and update the symbol table
// accordingly, then print an error message for any remaining undefined
// symbols.
void reportRemainingUndefines();
void loadMinGWAutomaticImports();
bool handleMinGWAutomaticImport(Symbol *sym, StringRef name);
// Returns a list of chunks of selected symbols.
std::vector<Chunk *> getChunks();
// Returns a symbol for a given name. Returns a nullptr if not found.
Symbol *find(StringRef name);
Symbol *findUnderscore(StringRef name);
// Occasionally we have to resolve an undefined symbol to its
// mangled symbol. This function tries to find a mangled name
// for U from the symbol table, and if found, sets the symbol as
// a weak alias for U.
Symbol *findMangle(StringRef name);
// Build a set of COFF objects representing the combined contents of
// BitcodeFiles and add them to the symbol table. Called after all files are
// added and before the writer writes results to a file.
void addCombinedLTOObjects();
std::vector<StringRef> compileBitcodeFiles();
// Creates an Undefined symbol for a given name.
Symbol *addUndefined(StringRef name);
Symbol *addSynthetic(StringRef n, Chunk *c);
Symbol *addAbsolute(StringRef n, uint64_t va);
Symbol *addUndefined(StringRef name, InputFile *f, bool isWeakAlias);
- void addLazy(ArchiveFile *f, const Archive::Symbol sym);
+ void addLazy(ArchiveFile *f, const Archive::Symbol &sym);
Symbol *addAbsolute(StringRef n, COFFSymbolRef s);
Symbol *addRegular(InputFile *f, StringRef n,
const llvm::object::coff_symbol_generic *s = nullptr,
SectionChunk *c = nullptr);
std::pair<DefinedRegular *, bool>
addComdat(InputFile *f, StringRef n,
const llvm::object::coff_symbol_generic *s = nullptr);
Symbol *addCommon(InputFile *f, StringRef n, uint64_t size,
const llvm::object::coff_symbol_generic *s = nullptr,
CommonChunk *c = nullptr);
Symbol *addImportData(StringRef n, ImportFile *f);
Symbol *addImportThunk(StringRef name, DefinedImportData *s,
uint16_t machine);
+ void addLibcall(StringRef name);
void reportDuplicate(Symbol *existing, InputFile *newFile);
// A list of chunks to be added to .rdata.
std::vector<Chunk *> localImportChunks;
// Iterates symbols in non-deterministic hash table order.
template <typename T> void forEachSymbol(T callback) {
for (auto &pair : symMap)
callback(pair.second);
}
private:
/// Inserts symbol if not already present.
std::pair<Symbol *, bool> insert(StringRef name);
/// Same as insert(Name), but also sets isUsedInRegularObj.
std::pair<Symbol *, bool> insert(StringRef name, InputFile *f);
std::vector<Symbol *> getSymsWithPrefix(StringRef prefix);
llvm::DenseMap<llvm::CachedHashStringRef, Symbol *> symMap;
std::unique_ptr<BitcodeCompiler> lto;
};
extern SymbolTable *symtab;
std::vector<std::string> getSymbolLocations(ObjFile *file, uint32_t symIndex);
} // namespace coff
} // namespace lld
#endif
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/Symbols.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/Symbols.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/Symbols.cpp (revision 351722)
@@ -1,117 +1,131 @@
//===- Symbols.cpp --------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Symbols.h"
#include "InputFiles.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Strings.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::object;
using namespace lld::coff;
+namespace lld {
+
static_assert(sizeof(SymbolUnion) <= 48,
"symbols should be optimized for memory usage");
// Returns a symbol name for an error message.
-std::string lld::toString(coff::Symbol &b) {
+static std::string demangle(StringRef symName) {
if (config->demangle)
- if (Optional<std::string> s = lld::demangleMSVC(b.getName()))
+ if (Optional<std::string> s = demangleMSVC(symName))
return *s;
- return b.getName();
+ return symName;
}
+std::string toString(coff::Symbol &b) { return demangle(b.getName()); }
+std::string toCOFFString(const Archive::Symbol &b) {
+ return demangle(b.getName());
+}
-namespace lld {
namespace coff {
StringRef Symbol::getName() {
// COFF symbol names are read lazily for performance reasons.
// Non-external symbol names are never used by the linker except for logging
// or debugging. Their internal references are resolved not by name but by
// symbol index. And because they are not external, no one can refer to them by
// name. Object files contain lots of non-external symbols, and creating
// StringRefs for them (which involves lots of strlen() on the string table)
// is a waste of time.
if (nameData == nullptr) {
auto *d = cast<DefinedCOFF>(this);
StringRef nameStr;
cast<ObjFile>(d->file)->getCOFFObj()->getSymbolName(d->sym, nameStr);
nameData = nameStr.data();
nameSize = nameStr.size();
assert(nameSize == nameStr.size() && "name length truncated");
}
return StringRef(nameData, nameSize);
}
InputFile *Symbol::getFile() {
if (auto *sym = dyn_cast<DefinedCOFF>(this))
return sym->file;
if (auto *sym = dyn_cast<Lazy>(this))
return sym->file;
return nullptr;
}
bool Symbol::isLive() const {
if (auto *r = dyn_cast<DefinedRegular>(this))
return r->getChunk()->live;
if (auto *imp = dyn_cast<DefinedImportData>(this))
return imp->file->live;
if (auto *imp = dyn_cast<DefinedImportThunk>(this))
return imp->wrappedSym->file->thunkLive;
// Assume any other kind of symbol is live.
return true;
}
// MinGW specific.
void Symbol::replaceKeepingName(Symbol *other, size_t size) {
StringRef origName = getName();
memcpy(this, other, size);
nameData = origName.data();
nameSize = origName.size();
}
COFFSymbolRef DefinedCOFF::getCOFFSymbol() {
size_t symSize = cast<ObjFile>(file)->getCOFFObj()->getSymbolTableEntrySize();
if (symSize == sizeof(coff_symbol16))
return COFFSymbolRef(reinterpret_cast<const coff_symbol16 *>(sym));
assert(symSize == sizeof(coff_symbol32));
return COFFSymbolRef(reinterpret_cast<const coff_symbol32 *>(sym));
}
uint16_t DefinedAbsolute::numOutputSections;
static Chunk *makeImportThunk(DefinedImportData *s, uint16_t machine) {
if (machine == AMD64)
return make<ImportThunkChunkX64>(s);
if (machine == I386)
return make<ImportThunkChunkX86>(s);
if (machine == ARM64)
return make<ImportThunkChunkARM64>(s);
assert(machine == ARMNT);
return make<ImportThunkChunkARM>(s);
}
DefinedImportThunk::DefinedImportThunk(StringRef name, DefinedImportData *s,
uint16_t machine)
: Defined(DefinedImportThunkKind, name), wrappedSym(s),
data(makeImportThunk(s, machine)) {}
Defined *Undefined::getWeakAlias() {
// A weak alias may be a weak alias to another symbol, so check recursively.
for (Symbol *a = weakAlias; a; a = cast<Undefined>(a)->weakAlias)
if (auto *d = dyn_cast<Defined>(a))
return d;
return nullptr;
+}
+
+MemoryBufferRef Lazy::getMemberBuffer() {
+ Archive::Child c =
+ CHECK(sym.getMember(),
+ "could not get the member for symbol " + toCOFFString(sym));
+ return CHECK(c.getMemoryBufferRef(),
+ "could not get the buffer for the member defining symbol " +
+ toCOFFString(sym));
}
} // namespace coff
} // namespace lld
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/Symbols.h
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/Symbols.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/Symbols.h (revision 351722)
@@ -1,435 +1,444 @@
//===- Symbols.h ------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLD_COFF_SYMBOLS_H
#define LLD_COFF_SYMBOLS_H
#include "Chunks.h"
#include "Config.h"
#include "lld/Common/LLVM.h"
#include "lld/Common/Memory.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/COFF.h"
#include <atomic>
#include <memory>
#include <vector>
namespace lld {
+
+std::string toString(coff::Symbol &b);
+
+// There are two different ways to convert an Archive::Symbol to a string:
+// One for Microsoft name mangling and one for Itanium name mangling.
+// Call the functions toCOFFString and toELFString, not just toString.
+std::string toCOFFString(const coff::Archive::Symbol &b);
+
namespace coff {
using llvm::object::Archive;
using llvm::object::COFFSymbolRef;
using llvm::object::coff_import_header;
using llvm::object::coff_symbol_generic;
class ArchiveFile;
class InputFile;
class ObjFile;
class SymbolTable;
// The base class for real symbol classes.
class Symbol {
public:
enum Kind {
// The order of these is significant. We start with the regular defined
// symbols as those are the most prevalent and the zero tag is the cheapest
// to set. Among the defined kinds, a lower kind is preferred over a higher
// kind when testing whether one symbol should take precedence
// over another.
DefinedRegularKind = 0,
DefinedCommonKind,
DefinedLocalImportKind,
DefinedImportThunkKind,
DefinedImportDataKind,
DefinedAbsoluteKind,
DefinedSyntheticKind,
UndefinedKind,
LazyKind,
LastDefinedCOFFKind = DefinedCommonKind,
LastDefinedKind = DefinedSyntheticKind,
};
Kind kind() const { return static_cast<Kind>(symbolKind); }
// Returns the symbol name.
StringRef getName();
void replaceKeepingName(Symbol *other, size_t size);
// Returns the file from which this symbol was created.
InputFile *getFile();
// Indicates that this symbol will be included in the final image. Only valid
// after calling markLive.
bool isLive() const;
protected:
friend SymbolTable;
explicit Symbol(Kind k, StringRef n = "")
: symbolKind(k), isExternal(true), isCOMDAT(false),
writtenToSymtab(false), pendingArchiveLoad(false), isGCRoot(false),
isRuntimePseudoReloc(false), nameSize(n.size()),
nameData(n.empty() ? nullptr : n.data()) {}
const unsigned symbolKind : 8;
unsigned isExternal : 1;
public:
// This bit is used by the \c DefinedRegular subclass.
unsigned isCOMDAT : 1;
// This bit is used by Writer::createSymbolAndStringTable() to prevent
// symbols from being written to the symbol table more than once.
unsigned writtenToSymtab : 1;
// True if this symbol was referenced by a regular (non-bitcode) object.
unsigned isUsedInRegularObj : 1;
// True if we've seen both a lazy and an undefined symbol with this symbol
// name, which means that we have enqueued an archive member load and should
// not load any more archive members to resolve the same symbol.
unsigned pendingArchiveLoad : 1;
/// True if we've already added this symbol to the list of GC roots.
unsigned isGCRoot : 1;
unsigned isRuntimePseudoReloc : 1;
protected:
// Symbol name length. Assume symbol lengths fit in a 32-bit integer.
uint32_t nameSize;
const char *nameData;
};
// The base class for any defined symbols, including absolute symbols,
// etc.
class Defined : public Symbol {
public:
Defined(Kind k, StringRef n) : Symbol(k, n) {}
static bool classof(const Symbol *s) { return s->kind() <= LastDefinedKind; }
// Returns the RVA (relative virtual address) of this symbol. The
// writer sets and uses RVAs.
uint64_t getRVA();
// Returns the chunk containing this symbol. Absolute symbols and __ImageBase
// do not have chunks, so this may return null.
Chunk *getChunk();
};
// Symbols defined via a COFF object file or bitcode file. For COFF files, this
// stores a coff_symbol_generic*, and names of internal symbols are lazily
// loaded through that. For bitcode files, Sym is nullptr and the name is stored
// as a decomposed StringRef.
class DefinedCOFF : public Defined {
friend Symbol;
public:
DefinedCOFF(Kind k, InputFile *f, StringRef n, const coff_symbol_generic *s)
: Defined(k, n), file(f), sym(s) {}
static bool classof(const Symbol *s) {
return s->kind() <= LastDefinedCOFFKind;
}
InputFile *getFile() { return file; }
COFFSymbolRef getCOFFSymbol();
InputFile *file;
protected:
const coff_symbol_generic *sym;
};
// Regular defined symbols read from object file symbol tables.
class DefinedRegular : public DefinedCOFF {
public:
DefinedRegular(InputFile *f, StringRef n, bool isCOMDAT,
bool isExternal = false,
const coff_symbol_generic *s = nullptr,
SectionChunk *c = nullptr)
: DefinedCOFF(DefinedRegularKind, f, n, s), data(c ? &c->repl : nullptr) {
this->isExternal = isExternal;
this->isCOMDAT = isCOMDAT;
}
static bool classof(const Symbol *s) {
return s->kind() == DefinedRegularKind;
}
uint64_t getRVA() const { return (*data)->getRVA() + sym->Value; }
SectionChunk *getChunk() const { return *data; }
uint32_t getValue() const { return sym->Value; }
SectionChunk **data;
};
class DefinedCommon : public DefinedCOFF {
public:
DefinedCommon(InputFile *f, StringRef n, uint64_t size,
const coff_symbol_generic *s = nullptr,
CommonChunk *c = nullptr)
: DefinedCOFF(DefinedCommonKind, f, n, s), data(c), size(size) {
this->isExternal = true;
}
static bool classof(const Symbol *s) {
return s->kind() == DefinedCommonKind;
}
uint64_t getRVA() { return data->getRVA(); }
CommonChunk *getChunk() { return data; }
private:
friend SymbolTable;
uint64_t getSize() const { return size; }
CommonChunk *data;
uint64_t size;
};
// Absolute symbols.
class DefinedAbsolute : public Defined {
public:
DefinedAbsolute(StringRef n, COFFSymbolRef s)
: Defined(DefinedAbsoluteKind, n), va(s.getValue()) {
isExternal = s.isExternal();
}
DefinedAbsolute(StringRef n, uint64_t v)
: Defined(DefinedAbsoluteKind, n), va(v) {}
static bool classof(const Symbol *s) {
return s->kind() == DefinedAbsoluteKind;
}
uint64_t getRVA() { return va - config->imageBase; }
void setVA(uint64_t v) { va = v; }
// Section index relocations against absolute symbols resolve to
// this 16 bit number, and it is the largest valid section index
// plus one. This variable keeps it.
static uint16_t numOutputSections;
private:
uint64_t va;
};
// This symbol is used for linker-synthesized symbols like __ImageBase and
// __safe_se_handler_table.
class DefinedSynthetic : public Defined {
public:
explicit DefinedSynthetic(StringRef name, Chunk *c)
: Defined(DefinedSyntheticKind, name), c(c) {}
static bool classof(const Symbol *s) {
return s->kind() == DefinedSyntheticKind;
}
// A null chunk indicates that this is __ImageBase. Otherwise, this is some
// other synthesized chunk, like SEHTableChunk.
uint32_t getRVA() { return c ? c->getRVA() : 0; }
Chunk *getChunk() { return c; }
private:
Chunk *c;
};
// This class represents a symbol defined in an archive file. It is
// created from an archive file header, and it knows how to load an
// object file from an archive to replace itself with a defined
// symbol. If the resolver finds both Undefined and Lazy for
// the same name, it will ask the Lazy to load a file.
class Lazy : public Symbol {
public:
Lazy(ArchiveFile *f, const Archive::Symbol s)
: Symbol(LazyKind, s.getName()), file(f), sym(s) {}
static bool classof(const Symbol *s) { return s->kind() == LazyKind; }
+ MemoryBufferRef getMemberBuffer();
+
ArchiveFile *file;
private:
friend SymbolTable;
private:
const Archive::Symbol sym;
};
// Undefined symbols.
class Undefined : public Symbol {
public:
explicit Undefined(StringRef n) : Symbol(UndefinedKind, n) {}
static bool classof(const Symbol *s) { return s->kind() == UndefinedKind; }
// An undefined symbol can have a fallback symbol which gives an
// undefined symbol a second chance if it would remain undefined.
// If it remains undefined, it'll be replaced with whatever the
// Alias pointer points to.
Symbol *weakAlias = nullptr;
// If this symbol is external weak, try to resolve it to a defined
// symbol by searching the chain of fallback symbols. Returns the symbol if
// successful, otherwise returns null.
Defined *getWeakAlias();
};
// Windows-specific classes.
// This class represents a symbol imported from a DLL. This has two
// names for internal use and external use. The former is used for
// name resolution, and the latter is used for the import descriptor
// table in an output. The former has "__imp_" prefix.
class DefinedImportData : public Defined {
public:
DefinedImportData(StringRef n, ImportFile *f)
: Defined(DefinedImportDataKind, n), file(f) {
}
static bool classof(const Symbol *s) {
return s->kind() == DefinedImportDataKind;
}
uint64_t getRVA() { return file->location->getRVA(); }
Chunk *getChunk() { return file->location; }
void setLocation(Chunk *addressTable) { file->location = addressTable; }
StringRef getDLLName() { return file->dllName; }
StringRef getExternalName() { return file->externalName; }
uint16_t getOrdinal() { return file->hdr->OrdinalHint; }
ImportFile *file;
};
// This class represents a symbol for a jump table entry which jumps
// to a function in a DLL. Linkers are supposed to create such symbols
// without "__imp_" prefix for all function symbols exported from
// DLLs, so that you can call DLL functions as regular functions with
// a regular name. A function pointer is given as a DefinedImportData.
class DefinedImportThunk : public Defined {
public:
DefinedImportThunk(StringRef name, DefinedImportData *s, uint16_t machine);
static bool classof(const Symbol *s) {
return s->kind() == DefinedImportThunkKind;
}
uint64_t getRVA() { return data->getRVA(); }
Chunk *getChunk() { return data; }
DefinedImportData *wrappedSym;
private:
Chunk *data;
};
// If you have a symbol "foo" in your object file, a symbol name
// "__imp_foo" becomes automatically available as a pointer to "foo".
// This class is for such automatically-created symbols.
// Yes, this is an odd feature. We didn't intend to implement that.
// This is here just for compatibility with MSVC.
class DefinedLocalImport : public Defined {
public:
DefinedLocalImport(StringRef n, Defined *s)
: Defined(DefinedLocalImportKind, n), data(make<LocalImportChunk>(s)) {}
static bool classof(const Symbol *s) {
return s->kind() == DefinedLocalImportKind;
}
uint64_t getRVA() { return data->getRVA(); }
Chunk *getChunk() { return data; }
private:
LocalImportChunk *data;
};
inline uint64_t Defined::getRVA() {
switch (kind()) {
case DefinedAbsoluteKind:
return cast<DefinedAbsolute>(this)->getRVA();
case DefinedSyntheticKind:
return cast<DefinedSynthetic>(this)->getRVA();
case DefinedImportDataKind:
return cast<DefinedImportData>(this)->getRVA();
case DefinedImportThunkKind:
return cast<DefinedImportThunk>(this)->getRVA();
case DefinedLocalImportKind:
return cast<DefinedLocalImport>(this)->getRVA();
case DefinedCommonKind:
return cast<DefinedCommon>(this)->getRVA();
case DefinedRegularKind:
return cast<DefinedRegular>(this)->getRVA();
case LazyKind:
case UndefinedKind:
llvm_unreachable("Cannot get the address for an undefined symbol.");
}
llvm_unreachable("unknown symbol kind");
}
inline Chunk *Defined::getChunk() {
switch (kind()) {
case DefinedRegularKind:
return cast<DefinedRegular>(this)->getChunk();
case DefinedAbsoluteKind:
return nullptr;
case DefinedSyntheticKind:
return cast<DefinedSynthetic>(this)->getChunk();
case DefinedImportDataKind:
return cast<DefinedImportData>(this)->getChunk();
case DefinedImportThunkKind:
return cast<DefinedImportThunk>(this)->getChunk();
case DefinedLocalImportKind:
return cast<DefinedLocalImport>(this)->getChunk();
case DefinedCommonKind:
return cast<DefinedCommon>(this)->getChunk();
case LazyKind:
case UndefinedKind:
llvm_unreachable("Cannot get the chunk of an undefined symbol.");
}
llvm_unreachable("unknown symbol kind");
}
// A buffer class that is large enough to hold any Symbol-derived
// object. We allocate memory using this class and instantiate a symbol
// using the placement new.
union SymbolUnion {
alignas(DefinedRegular) char a[sizeof(DefinedRegular)];
alignas(DefinedCommon) char b[sizeof(DefinedCommon)];
alignas(DefinedAbsolute) char c[sizeof(DefinedAbsolute)];
alignas(DefinedSynthetic) char d[sizeof(DefinedSynthetic)];
alignas(Lazy) char e[sizeof(Lazy)];
alignas(Undefined) char f[sizeof(Undefined)];
alignas(DefinedImportData) char g[sizeof(DefinedImportData)];
alignas(DefinedImportThunk) char h[sizeof(DefinedImportThunk)];
alignas(DefinedLocalImport) char i[sizeof(DefinedLocalImport)];
};
template <typename T, typename... ArgT>
void replaceSymbol(Symbol *s, ArgT &&... arg) {
static_assert(std::is_trivially_destructible<T>(),
"Symbol types must be trivially destructible");
static_assert(sizeof(T) <= sizeof(SymbolUnion), "Symbol too small");
static_assert(alignof(T) <= alignof(SymbolUnion),
"SymbolUnion not aligned enough");
assert(static_cast<Symbol *>(static_cast<T *>(nullptr)) == nullptr &&
"Not a Symbol");
new (s) T(std::forward<ArgT>(arg)...);
}
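// --- Editor's illustrative sketch (not part of LLD or of this change) -------
// Minimal, self-contained model of the replaceSymbol technique above: every
// symbol kind fits in one pre-allocated, suitably aligned buffer, so a symbol
// can change kind in place (e.g. Undefined -> Defined) without invalidating
// pointers held elsewhere. Base, Undef, Def and Storage are toy stand-ins.
#include <new>
#include <type_traits>
#include <utility>
namespace sketch {
struct Base { int kind = 0; };
struct Undef : Base { Undef() { kind = 1; } };
struct Def : Base { explicit Def(int v) : value(v) { kind = 2; } int value; };
union Storage {
  alignas(Undef) char a[sizeof(Undef)];
  alignas(Def) char b[sizeof(Def)];
};
template <typename T, typename... ArgT>
void replaceInPlace(Base *s, ArgT &&... args) {
  static_assert(std::is_trivially_destructible<T>::value,
                "only trivially destructible types can be overwritten");
  static_assert(sizeof(T) <= sizeof(Storage), "type too large for Storage");
  static_assert(alignof(T) <= alignof(Storage), "Storage not aligned enough");
  new (s) T(std::forward<ArgT>(args)...); // overwrite in place; address stays stable
}
} // namespace sketch
// Usage: sketch::Storage buf; sketch::Base *s = new (&buf) sketch::Undef;
//        sketch::replaceInPlace<sketch::Def>(s, 42);
// -----------------------------------------------------------------------------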
} // namespace coff
-std::string toString(coff::Symbol &b);
} // namespace lld
#endif
Index: projects/clang900-import/contrib/llvm/tools/lld/COFF/Writer.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/COFF/Writer.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/COFF/Writer.cpp (revision 351722)
@@ -1,1927 +1,1932 @@
//===- Writer.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Writer.h"
#include "Config.h"
#include "DLL.h"
#include "InputFiles.h"
#include "MapFile.h"
#include "PDB.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Threads.h"
#include "lld/Common/Timer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/RandomNumberGenerator.h"
#include "llvm/Support/xxhash.h"
#include <algorithm>
#include <cstdio>
#include <map>
#include <memory>
#include <utility>
using namespace llvm;
using namespace llvm::COFF;
using namespace llvm::object;
using namespace llvm::support;
using namespace llvm::support::endian;
using namespace lld;
using namespace lld::coff;
/* To re-generate DOSProgram:
$ cat > /tmp/DOSProgram.asm
org 0
; Copy cs to ds.
push cs
pop ds
; Point ds:dx at the $-terminated string.
mov dx, str
; Int 21/AH=09h: Write string to standard output.
mov ah, 0x9
int 0x21
; Int 21/AH=4Ch: Exit with return code (in AL).
mov ax, 0x4C01
int 0x21
str:
db 'This program cannot be run in DOS mode.$'
align 8, db 0
$ nasm -fbin /tmp/DOSProgram.asm -o /tmp/DOSProgram.bin
$ xxd -i /tmp/DOSProgram.bin
*/
static unsigned char dosProgram[] = {
0x0e, 0x1f, 0xba, 0x0e, 0x00, 0xb4, 0x09, 0xcd, 0x21, 0xb8, 0x01, 0x4c,
0xcd, 0x21, 0x54, 0x68, 0x69, 0x73, 0x20, 0x70, 0x72, 0x6f, 0x67, 0x72,
0x61, 0x6d, 0x20, 0x63, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x20, 0x62, 0x65,
0x20, 0x72, 0x75, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x44, 0x4f, 0x53, 0x20,
0x6d, 0x6f, 0x64, 0x65, 0x2e, 0x24, 0x00, 0x00
};
static_assert(sizeof(dosProgram) % 8 == 0,
"DOSProgram size must be multiple of 8");
static const int dosStubSize = sizeof(dos_header) + sizeof(dosProgram);
static_assert(dosStubSize % 8 == 0, "DOSStub size must be multiple of 8");
static const int numberOfDataDirectory = 16;
// Global vector of all output sections. After output sections are finalized,
// this can be indexed by Chunk::getOutputSection.
static std::vector<OutputSection *> outputSections;
OutputSection *Chunk::getOutputSection() const {
return osidx == 0 ? nullptr : outputSections[osidx - 1];
}
namespace {
class DebugDirectoryChunk : public NonSectionChunk {
public:
DebugDirectoryChunk(const std::vector<Chunk *> &r, bool writeRepro)
: records(r), writeRepro(writeRepro) {}
size_t getSize() const override {
return (records.size() + int(writeRepro)) * sizeof(debug_directory);
}
void writeTo(uint8_t *b) const override {
auto *d = reinterpret_cast<debug_directory *>(b);
for (const Chunk *record : records) {
OutputSection *os = record->getOutputSection();
uint64_t offs = os->getFileOff() + (record->getRVA() - os->getRVA());
fillEntry(d, COFF::IMAGE_DEBUG_TYPE_CODEVIEW, record->getSize(),
record->getRVA(), offs);
++d;
}
if (writeRepro) {
// FIXME: The COFF spec allows either a 0-sized entry to just say
// "the timestamp field is really a hash", or a 4-byte size field
// followed by that many bytes containing a longer hash (with the
// lowest 4 bytes usually being the timestamp in little-endian order).
// Consider storing the full 8 bytes computed by xxHash64 here.
fillEntry(d, COFF::IMAGE_DEBUG_TYPE_REPRO, 0, 0, 0);
}
}
void setTimeDateStamp(uint32_t timeDateStamp) {
for (support::ulittle32_t *tds : timeDateStamps)
*tds = timeDateStamp;
}
private:
void fillEntry(debug_directory *d, COFF::DebugType debugType, size_t size,
uint64_t rva, uint64_t offs) const {
d->Characteristics = 0;
d->TimeDateStamp = 0;
d->MajorVersion = 0;
d->MinorVersion = 0;
d->Type = debugType;
d->SizeOfData = size;
d->AddressOfRawData = rva;
d->PointerToRawData = offs;
timeDateStamps.push_back(&d->TimeDateStamp);
}
mutable std::vector<support::ulittle32_t *> timeDateStamps;
const std::vector<Chunk *> &records;
bool writeRepro;
};
class CVDebugRecordChunk : public NonSectionChunk {
public:
size_t getSize() const override {
return sizeof(codeview::DebugInfo) + config->pdbAltPath.size() + 1;
}
void writeTo(uint8_t *b) const override {
// Save off the DebugInfo entry to backfill the file signature (build id)
// in Writer::writeBuildId
buildId = reinterpret_cast<codeview::DebugInfo *>(b);
// variable sized field (PDB Path)
char *p = reinterpret_cast<char *>(b + sizeof(*buildId));
if (!config->pdbAltPath.empty())
memcpy(p, config->pdbAltPath.data(), config->pdbAltPath.size());
p[config->pdbAltPath.size()] = '\0';
}
mutable codeview::DebugInfo *buildId = nullptr;
};
// PartialSection represents a group of chunks that contribute to an
// OutputSection. Collating a collection of PartialSections with the same name
// and characteristics constitutes the OutputSection.
class PartialSectionKey {
public:
StringRef name;
unsigned characteristics;
bool operator<(const PartialSectionKey &other) const {
int c = name.compare(other.name);
if (c == 1)
return false;
if (c == 0)
return characteristics < other.characteristics;
return true;
}
};
// The writer writes a SymbolTable result to a file.
class Writer {
public:
Writer() : buffer(errorHandler().outputBuffer) {}
void run();
private:
void createSections();
void createMiscChunks();
void createImportTables();
void appendImportThunks();
void locateImportTables();
void createExportTable();
void mergeSections();
void removeUnusedSections();
void assignAddresses();
void finalizeAddresses();
void removeEmptySections();
void assignOutputSectionIndices();
void createSymbolAndStringTable();
void openFile(StringRef outputPath);
template <typename PEHeaderTy> void writeHeader();
void createSEHTable();
void createRuntimePseudoRelocs();
void insertCtorDtorSymbols();
void createGuardCFTables();
void markSymbolsForRVATable(ObjFile *file,
ArrayRef<SectionChunk *> symIdxChunks,
SymbolRVASet &tableSymbols);
void maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym,
StringRef countSym);
void setSectionPermissions();
void writeSections();
void writeBuildId();
void sortExceptionTable();
void sortCRTSectionChunks(std::vector<Chunk *> &chunks);
void addSyntheticIdata();
void fixPartialSectionChars(StringRef name, uint32_t chars);
bool fixGnuImportChunks();
PartialSection *createPartialSection(StringRef name, uint32_t outChars);
PartialSection *findPartialSection(StringRef name, uint32_t outChars);
llvm::Optional<coff_symbol16> createSymbol(Defined *d);
size_t addEntryToStringTable(StringRef str);
OutputSection *findSection(StringRef name);
void addBaserels();
void addBaserelBlocks(std::vector<Baserel> &v);
uint32_t getSizeOfInitializedData();
std::unique_ptr<FileOutputBuffer> &buffer;
std::map<PartialSectionKey, PartialSection *> partialSections;
std::vector<char> strtab;
std::vector<llvm::object::coff_symbol16> outputSymtab;
IdataContents idata;
Chunk *importTableStart = nullptr;
uint64_t importTableSize = 0;
Chunk *iatStart = nullptr;
uint64_t iatSize = 0;
DelayLoadContents delayIdata;
EdataContents edata;
bool setNoSEHCharacteristic = false;
DebugDirectoryChunk *debugDirectory = nullptr;
std::vector<Chunk *> debugRecords;
CVDebugRecordChunk *buildId = nullptr;
ArrayRef<uint8_t> sectionTable;
uint64_t fileSize;
uint32_t pointerToSymbolTable = 0;
uint64_t sizeOfImage;
uint64_t sizeOfHeaders;
OutputSection *textSec;
OutputSection *rdataSec;
OutputSection *buildidSec;
OutputSection *dataSec;
OutputSection *pdataSec;
OutputSection *idataSec;
OutputSection *edataSec;
OutputSection *didatSec;
OutputSection *rsrcSec;
OutputSection *relocSec;
OutputSection *ctorsSec;
OutputSection *dtorsSec;
// The first and last .pdata sections in the output file.
//
// We need to keep track of the location of .pdata in whichever section it
// gets merged into so that we can sort its contents and emit a correct data
// directory entry for the exception table. This is also the case for some
// other sections (such as .edata) but because the contents of those sections
// are entirely linker-generated we can keep track of their locations using
// the chunks that the linker creates. All .pdata chunks come from input
// files, so we need to keep track of them separately.
Chunk *firstPdata = nullptr;
Chunk *lastPdata;
};
} // anonymous namespace
namespace lld {
namespace coff {
static Timer codeLayoutTimer("Code Layout", Timer::root());
static Timer diskCommitTimer("Commit Output File", Timer::root());
void writeResult() { Writer().run(); }
void OutputSection::addChunk(Chunk *c) {
chunks.push_back(c);
}
void OutputSection::insertChunkAtStart(Chunk *c) {
chunks.insert(chunks.begin(), c);
}
void OutputSection::setPermissions(uint32_t c) {
header.Characteristics &= ~permMask;
header.Characteristics |= c;
}
void OutputSection::merge(OutputSection *other) {
chunks.insert(chunks.end(), other->chunks.begin(), other->chunks.end());
other->chunks.clear();
contribSections.insert(contribSections.end(), other->contribSections.begin(),
other->contribSections.end());
other->contribSections.clear();
}
// Write the section header to a given buffer.
void OutputSection::writeHeaderTo(uint8_t *buf) {
auto *hdr = reinterpret_cast<coff_section *>(buf);
*hdr = header;
if (stringTableOff) {
// If name is too long, write offset into the string table as a name.
sprintf(hdr->Name, "/%d", stringTableOff);
} else {
assert(!config->debug || name.size() <= COFF::NameSize ||
(hdr->Characteristics & IMAGE_SCN_MEM_DISCARDABLE) == 0);
strncpy(hdr->Name, name.data(),
std::min(name.size(), (size_t)COFF::NameSize));
}
}
void OutputSection::addContributingPartialSection(PartialSection *sec) {
contribSections.push_back(sec);
}
} // namespace coff
} // namespace lld
// Check whether the target address S is in range from a relocation
// of type relType at address P.
static bool isInRange(uint16_t relType, uint64_t s, uint64_t p, int margin) {
if (config->machine == ARMNT) {
int64_t diff = AbsoluteDifference(s, p + 4) + margin;
switch (relType) {
case IMAGE_REL_ARM_BRANCH20T:
return isInt<21>(diff);
case IMAGE_REL_ARM_BRANCH24T:
case IMAGE_REL_ARM_BLX23T:
return isInt<25>(diff);
default:
return true;
}
} else if (config->machine == ARM64) {
int64_t diff = AbsoluteDifference(s, p) + margin;
switch (relType) {
case IMAGE_REL_ARM64_BRANCH26:
return isInt<28>(diff);
case IMAGE_REL_ARM64_BRANCH19:
return isInt<21>(diff);
case IMAGE_REL_ARM64_BRANCH14:
return isInt<16>(diff);
default:
return true;
}
} else {
llvm_unreachable("Unexpected architecture");
}
}
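// --- Editor's illustrative sketch (not part of LLD or of this change) -------
// The range test above reduces to "does a signed displacement fit in N bits".
// fitsSigned<N> below is a hand-rolled equivalent of LLVM's isInt<N>, shown
// only for illustration; arm64Branch26InRange is a hypothetical helper, not
// lld API. IMAGE_REL_ARM64_BRANCH26 encodes a 26-bit word offset, i.e. a
// 28-bit signed byte displacement (roughly +/-128 MiB).
#include <cstdint>
namespace sketch {
template <unsigned N> bool fitsSigned(int64_t x) {
  return x >= -(int64_t(1) << (N - 1)) && x < (int64_t(1) << (N - 1));
}
inline bool arm64Branch26InRange(uint64_t s, uint64_t p, int margin) {
  int64_t diff = int64_t(s > p ? s - p : p - s) + margin; // |S - P| + margin
  return fitsSigned<28>(diff);
}
} // namespace sketch
// -----------------------------------------------------------------------------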
// Return the last thunk for the given target if it is in range,
// or create a new one.
static std::pair<Defined *, bool>
getThunk(DenseMap<uint64_t, Defined *> &lastThunks, Defined *target, uint64_t p,
uint16_t type, int margin) {
Defined *&lastThunk = lastThunks[target->getRVA()];
if (lastThunk && isInRange(type, lastThunk->getRVA(), p, margin))
return {lastThunk, false};
Chunk *c;
switch (config->machine) {
case ARMNT:
c = make<RangeExtensionThunkARM>(target);
break;
case ARM64:
c = make<RangeExtensionThunkARM64>(target);
break;
default:
llvm_unreachable("Unexpected architecture");
}
Defined *d = make<DefinedSynthetic>("", c);
lastThunk = d;
return {d, true};
}
// This checks all relocations, and for any relocation which isn't in range
// it adds a thunk after the section chunk that contains the relocation.
// If the latest thunk for the specific target is in range, that is used
// instead of creating a new thunk. All range checks are done with the
// specified margin, to make sure that relocations that originally are in
// range, but only barely, also get thunks - in case other added thunks makes
// the target go out of range.
//
// After adding thunks, we verify that all relocations are in range (with
// no extra margin requirements). If this failed, we restart (throwing away
// the previously created thunks) and retry with a wider margin.
static bool createThunks(OutputSection *os, int margin) {
bool addressesChanged = false;
DenseMap<uint64_t, Defined *> lastThunks;
DenseMap<std::pair<ObjFile *, Defined *>, uint32_t> thunkSymtabIndices;
size_t thunksSize = 0;
// Recheck Chunks.size() each iteration, since we can insert more
// elements into it.
for (size_t i = 0; i != os->chunks.size(); ++i) {
SectionChunk *sc = dyn_cast_or_null<SectionChunk>(os->chunks[i]);
if (!sc)
continue;
size_t thunkInsertionSpot = i + 1;
// Try to get a good enough estimate of where new thunks will be placed.
// Offset this by the size of the new thunks added so far, to make the
// estimate slightly better.
size_t thunkInsertionRVA = sc->getRVA() + sc->getSize() + thunksSize;
ObjFile *file = sc->file;
std::vector<std::pair<uint32_t, uint32_t>> relocReplacements;
ArrayRef<coff_relocation> originalRelocs =
file->getCOFFObj()->getRelocations(sc->header);
for (size_t j = 0, e = originalRelocs.size(); j < e; ++j) {
const coff_relocation &rel = originalRelocs[j];
Symbol *relocTarget = file->getSymbol(rel.SymbolTableIndex);
// The estimate of the source address P should be pretty accurate,
// but we don't know whether the target Symbol address should be
// offset by thunksSize or not (or by some of thunksSize but not all of
// it), giving us some uncertainty once we have added one thunk.
uint64_t p = sc->getRVA() + rel.VirtualAddress + thunksSize;
Defined *sym = dyn_cast_or_null<Defined>(relocTarget);
if (!sym)
continue;
uint64_t s = sym->getRVA();
if (isInRange(rel.Type, s, p, margin))
continue;
// If the target isn't in range, hook it up to an existing or new
// thunk.
Defined *thunk;
bool wasNew;
std::tie(thunk, wasNew) = getThunk(lastThunks, sym, p, rel.Type, margin);
if (wasNew) {
Chunk *thunkChunk = thunk->getChunk();
thunkChunk->setRVA(
thunkInsertionRVA); // Estimate of where it will be located.
os->chunks.insert(os->chunks.begin() + thunkInsertionSpot, thunkChunk);
thunkInsertionSpot++;
thunksSize += thunkChunk->getSize();
thunkInsertionRVA += thunkChunk->getSize();
addressesChanged = true;
}
// To redirect the relocation, add a symbol to the parent object file's
// symbol table, and replace the relocation symbol table index with the
// new index.
auto insertion = thunkSymtabIndices.insert({{file, thunk}, ~0U});
uint32_t &thunkSymbolIndex = insertion.first->second;
if (insertion.second)
thunkSymbolIndex = file->addRangeThunkSymbol(thunk);
relocReplacements.push_back({j, thunkSymbolIndex});
}
// Get a writable copy of this section's relocations so they can be
// modified. If the relocations point into the object file, allocate new
// memory. Otherwise, this must be previously allocated memory that can be
// modified in place.
ArrayRef<coff_relocation> curRelocs = sc->getRelocs();
MutableArrayRef<coff_relocation> newRelocs;
if (originalRelocs.data() == curRelocs.data()) {
newRelocs = makeMutableArrayRef(
bAlloc.Allocate<coff_relocation>(originalRelocs.size()),
originalRelocs.size());
} else {
newRelocs = makeMutableArrayRef(
const_cast<coff_relocation *>(curRelocs.data()), curRelocs.size());
}
// Copy each relocation, but replace the symbol table indices which need
// thunks.
auto nextReplacement = relocReplacements.begin();
auto endReplacement = relocReplacements.end();
for (size_t i = 0, e = originalRelocs.size(); i != e; ++i) {
newRelocs[i] = originalRelocs[i];
if (nextReplacement != endReplacement && nextReplacement->first == i) {
newRelocs[i].SymbolTableIndex = nextReplacement->second;
++nextReplacement;
}
}
sc->setRelocs(newRelocs);
}
return addressesChanged;
}
// Verify that all relocations are in range, with no extra margin requirements.
static bool verifyRanges(const std::vector<Chunk *> chunks) {
for (Chunk *c : chunks) {
SectionChunk *sc = dyn_cast_or_null<SectionChunk>(c);
if (!sc)
continue;
ArrayRef<coff_relocation> relocs = sc->getRelocs();
for (size_t j = 0, e = relocs.size(); j < e; ++j) {
const coff_relocation &rel = relocs[j];
Symbol *relocTarget = sc->file->getSymbol(rel.SymbolTableIndex);
Defined *sym = dyn_cast_or_null<Defined>(relocTarget);
if (!sym)
continue;
uint64_t p = sc->getRVA() + rel.VirtualAddress;
uint64_t s = sym->getRVA();
if (!isInRange(rel.Type, s, p, 0))
return false;
}
}
return true;
}
// Assign addresses and add thunks if necessary.
void Writer::finalizeAddresses() {
assignAddresses();
if (config->machine != ARMNT && config->machine != ARM64)
return;
size_t origNumChunks = 0;
for (OutputSection *sec : outputSections) {
sec->origChunks = sec->chunks;
origNumChunks += sec->chunks.size();
}
int pass = 0;
int margin = 1024 * 100;
while (true) {
// First check whether we need thunks at all, or if the previous pass of
// adding them turned out ok.
bool rangesOk = true;
size_t numChunks = 0;
for (OutputSection *sec : outputSections) {
if (!verifyRanges(sec->chunks)) {
rangesOk = false;
break;
}
numChunks += sec->chunks.size();
}
if (rangesOk) {
if (pass > 0)
log("Added " + Twine(numChunks - origNumChunks) + " thunks with " +
"margin " + Twine(margin) + " in " + Twine(pass) + " passes");
return;
}
if (pass >= 10)
fatal("adding thunks hasn't converged after " + Twine(pass) + " passes");
if (pass > 0) {
// If the previous pass didn't work out, reset everything back to the
// original conditions before retrying with a wider margin. This should
// ideally never happen under real circumstances.
for (OutputSection *sec : outputSections)
sec->chunks = sec->origChunks;
margin *= 2;
}
// Try adding thunks everywhere where it is needed, with a margin
// to avoid things going out of range due to the added thunks.
bool addressesChanged = false;
for (OutputSection *sec : outputSections)
addressesChanged |= createThunks(sec, margin);
// If the verification above thought we needed thunks, we should have
// added some.
assert(addressesChanged);
// Recalculate the layout for the whole image (and verify the ranges at
// the start of the next round).
assignAddresses();
pass++;
}
}
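// --- Editor's illustrative sketch (not part of LLD or of this change) -------
// Control-flow skeleton of the convergence loop above. verifyAll, addThunks,
// relayout and rollback are hypothetical callables standing in for
// verifyRanges over every section, createThunks, assignAddresses and the
// chunk-list restore; the point is the pattern: verify after each pass, and on
// failure roll back and retry with a doubled margin, bailing out after a
// fixed number of passes.
#include <cassert>
template <class Verify, class Add, class Relayout, class Rollback>
void convergeThunks(Verify verifyAll, Add addThunks, Relayout relayout,
                    Rollback rollback) {
  int margin = 100 * 1024;
  for (int pass = 0;; ++pass) {
    if (verifyAll())
      return;                   // every relocation is in range; done
    assert(pass < 10 && "thunk insertion did not converge");
    if (pass > 0) {
      rollback();               // drop thunks added by the failed pass
      margin *= 2;              // and retry with a wider safety margin
    }
    addThunks(margin);
    relayout();                 // recompute addresses before re-verifying
  }
}
// -----------------------------------------------------------------------------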
// The main function of the writer.
void Writer::run() {
ScopedTimer t1(codeLayoutTimer);
createImportTables();
createSections();
createMiscChunks();
appendImportThunks();
createExportTable();
mergeSections();
removeUnusedSections();
finalizeAddresses();
removeEmptySections();
assignOutputSectionIndices();
setSectionPermissions();
createSymbolAndStringTable();
if (fileSize > UINT32_MAX)
fatal("image size (" + Twine(fileSize) + ") " +
"exceeds maximum allowable size (" + Twine(UINT32_MAX) + ")");
openFile(config->outputFile);
if (config->is64()) {
writeHeader<pe32plus_header>();
} else {
writeHeader<pe32_header>();
}
writeSections();
sortExceptionTable();
t1.stop();
if (!config->pdbPath.empty() && config->debug) {
assert(buildId);
createPDB(symtab, outputSections, sectionTable, buildId->buildId);
}
writeBuildId();
writeMapFile(outputSections);
+ if (errorCount())
+ return;
+
ScopedTimer t2(diskCommitTimer);
if (auto e = buffer->commit())
fatal("failed to write the output file: " + toString(std::move(e)));
}
static StringRef getOutputSectionName(StringRef name) {
StringRef s = name.split('$').first;
// Treat a later period as a separator for MinGW, for sections like
// ".ctors.01234".
return s.substr(0, s.find('.', 1));
}
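// --- Editor's illustrative sketch (not part of LLD or of this change) -------
// Standalone equivalent of getOutputSectionName above, with std::string in
// place of StringRef, to make the mapping concrete:
//   ".text$foo"    -> ".text"
//   ".ctors.01234" -> ".ctors"   (MinGW: a later '.' also separates)
//   ".rdata"       -> ".rdata"
#include <string>
static std::string outputSectionNameSketch(const std::string &name) {
  std::string s = name.substr(0, name.find('$')); // drop "$..." suffix, if any
  return s.substr(0, s.find('.', 1));             // drop a later ".suffix" too
}
// -----------------------------------------------------------------------------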
// For /order.
static void sortBySectionOrder(std::vector<Chunk *> &chunks) {
auto getPriority = [](const Chunk *c) {
if (auto *sec = dyn_cast<SectionChunk>(c))
if (sec->sym)
return config->order.lookup(sec->sym->getName());
return 0;
};
llvm::stable_sort(chunks, [=](const Chunk *a, const Chunk *b) {
return getPriority(a) < getPriority(b);
});
}
// Change the characteristics of existing PartialSections that belong to the
// section Name to Chars.
void Writer::fixPartialSectionChars(StringRef name, uint32_t chars) {
for (auto it : partialSections) {
PartialSection *pSec = it.second;
StringRef curName = pSec->name;
if (!curName.consume_front(name) ||
(!curName.empty() && !curName.startswith("$")))
continue;
if (pSec->characteristics == chars)
continue;
PartialSection *destSec = createPartialSection(pSec->name, chars);
destSec->chunks.insert(destSec->chunks.end(), pSec->chunks.begin(),
pSec->chunks.end());
pSec->chunks.clear();
}
}
// Sort concrete section chunks from GNU import libraries.
//
// GNU binutils doesn't use short import files, but instead produces import
// libraries that consist of object files, with section chunks for the .idata$*
// sections. These are linked just as regular static libraries. Each import
// library consists of one header object, one object file for every imported
// symbol, and one trailer object. In order for the .idata tables/lists to
// be formed correctly, the section chunks within each .idata$* section need
// to be grouped by library, and sorted alphabetically within each library
// (which makes sure the header comes first and the trailer last).
bool Writer::fixGnuImportChunks() {
uint32_t rdata = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ;
// Make sure all .idata$* section chunks are mapped as RDATA in order to
// be sorted into the same sections as our own synthesized .idata chunks.
fixPartialSectionChars(".idata", rdata);
bool hasIdata = false;
// Sort all .idata$* chunks, grouping chunks from the same library,
// with alphabetical ordering of the object files within a library.
for (auto it : partialSections) {
PartialSection *pSec = it.second;
if (!pSec->name.startswith(".idata"))
continue;
if (!pSec->chunks.empty())
hasIdata = true;
llvm::stable_sort(pSec->chunks, [&](Chunk *s, Chunk *t) {
SectionChunk *sc1 = dyn_cast_or_null<SectionChunk>(s);
SectionChunk *sc2 = dyn_cast_or_null<SectionChunk>(t);
if (!sc1 || !sc2) {
// If only sc1 is a SectionChunk, sort it first. If sc2 is null (or both
// are), s is not less than t.
return sc1 != nullptr;
}
// Make a string with "libraryname/objectfile" for sorting, achieving
// both grouping by library and sorting of objects within a library,
// at once.
std::string key1 =
(sc1->file->parentName + "/" + sc1->file->getName()).str();
std::string key2 =
(sc2->file->parentName + "/" + sc2->file->getName()).str();
return key1 < key2;
});
}
return hasIdata;
}
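// --- Editor's illustrative sketch (not part of LLD or of this change) -------
// The stable sort above keys each .idata$* chunk on "archive/member", which
// groups chunks by import library and orders members alphabetically within
// each library (header object first, trailer object last). ChunkInfo and
// sortIdataChunks are hypothetical stand-ins for SectionChunk and the lambda
// above.
#include <algorithm>
#include <string>
#include <vector>
namespace sketch {
struct ChunkInfo { std::string archiveName, memberName; };
inline void sortIdataChunks(std::vector<ChunkInfo> &chunks) {
  std::stable_sort(chunks.begin(), chunks.end(),
                   [](const ChunkInfo &a, const ChunkInfo &b) {
                     return a.archiveName + "/" + a.memberName <
                            b.archiveName + "/" + b.memberName;
                   });
}
} // namespace sketch
// -----------------------------------------------------------------------------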
// Add generated idata chunks, for imported symbols and DLLs, and a
// terminator in .idata$2.
void Writer::addSyntheticIdata() {
uint32_t rdata = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ;
idata.create();
// Add the .idata content in the right section groups, to allow
// chunks from other linked in object files to be grouped together.
// See Microsoft PE/COFF spec 5.4 for details.
auto add = [&](StringRef n, std::vector<Chunk *> &v) {
PartialSection *pSec = createPartialSection(n, rdata);
pSec->chunks.insert(pSec->chunks.end(), v.begin(), v.end());
};
// The loader assumes a specific order of data.
// Add each type in the correct order.
add(".idata$2", idata.dirs);
add(".idata$4", idata.lookups);
add(".idata$5", idata.addresses);
add(".idata$6", idata.hints);
add(".idata$7", idata.dllNames);
}
// Locate the first Chunk and size of the import directory list and the
// IAT.
void Writer::locateImportTables() {
uint32_t rdata = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ;
if (PartialSection *importDirs = findPartialSection(".idata$2", rdata)) {
if (!importDirs->chunks.empty())
importTableStart = importDirs->chunks.front();
for (Chunk *c : importDirs->chunks)
importTableSize += c->getSize();
}
if (PartialSection *importAddresses = findPartialSection(".idata$5", rdata)) {
if (!importAddresses->chunks.empty())
iatStart = importAddresses->chunks.front();
for (Chunk *c : importAddresses->chunks)
iatSize += c->getSize();
}
}
// Return whether a SectionChunk's suffix (the dollar and any trailing
// suffix) should be removed and sorted into the main suffixless
// PartialSection.
static bool shouldStripSectionSuffix(SectionChunk *sc, StringRef name) {
// On MinGW, comdat groups are formed by putting the comdat group name
// after the '$' in the section name. For .eh_frame$<symbol>, that must
// still be sorted before the .eh_frame trailer from crtend.o, thus just
// strip the section name trailer. For other sections, such as
// .tls$$<symbol> (where non-comdat .tls symbols are otherwise stored in
// ".tls$"), they must be strictly sorted after .tls. And for the
// hypothetical case of comdat .CRT$XCU, we definitely need to keep the
// suffix for sorting. Thus, to play it safe, only strip the suffix for
// the standard sections.
if (!config->mingw)
return false;
if (!sc || !sc->isCOMDAT())
return false;
return name.startswith(".text$") || name.startswith(".data$") ||
name.startswith(".rdata$") || name.startswith(".pdata$") ||
name.startswith(".xdata$") || name.startswith(".eh_frame$");
}
// Create output section objects and add them to OutputSections.
void Writer::createSections() {
// First, create the builtin sections.
const uint32_t data = IMAGE_SCN_CNT_INITIALIZED_DATA;
const uint32_t bss = IMAGE_SCN_CNT_UNINITIALIZED_DATA;
const uint32_t code = IMAGE_SCN_CNT_CODE;
const uint32_t discardable = IMAGE_SCN_MEM_DISCARDABLE;
const uint32_t r = IMAGE_SCN_MEM_READ;
const uint32_t w = IMAGE_SCN_MEM_WRITE;
const uint32_t x = IMAGE_SCN_MEM_EXECUTE;
SmallDenseMap<std::pair<StringRef, uint32_t>, OutputSection *> sections;
auto createSection = [&](StringRef name, uint32_t outChars) {
OutputSection *&sec = sections[{name, outChars}];
if (!sec) {
sec = make<OutputSection>(name, outChars);
outputSections.push_back(sec);
}
return sec;
};
// Try to match the section order used by link.exe.
textSec = createSection(".text", code | r | x);
createSection(".bss", bss | r | w);
rdataSec = createSection(".rdata", data | r);
buildidSec = createSection(".buildid", data | r);
dataSec = createSection(".data", data | r | w);
pdataSec = createSection(".pdata", data | r);
idataSec = createSection(".idata", data | r);
edataSec = createSection(".edata", data | r);
didatSec = createSection(".didat", data | r);
rsrcSec = createSection(".rsrc", data | r);
relocSec = createSection(".reloc", data | discardable | r);
ctorsSec = createSection(".ctors", data | r | w);
dtorsSec = createSection(".dtors", data | r | w);
// Then bin chunks by name and output characteristics.
for (Chunk *c : symtab->getChunks()) {
auto *sc = dyn_cast<SectionChunk>(c);
if (sc && !sc->live) {
if (config->verbose)
sc->printDiscardedMessage();
continue;
}
StringRef name = c->getSectionName();
if (shouldStripSectionSuffix(sc, name))
name = name.split('$').first;
PartialSection *pSec = createPartialSection(name,
c->getOutputCharacteristics());
pSec->chunks.push_back(c);
}
fixPartialSectionChars(".rsrc", data | r);
// Even in non MinGW cases, we might need to link against GNU import
// libraries.
bool hasIdata = fixGnuImportChunks();
if (!idata.empty())
hasIdata = true;
if (hasIdata)
addSyntheticIdata();
// Process an /order option.
if (!config->order.empty())
for (auto it : partialSections)
sortBySectionOrder(it.second->chunks);
if (hasIdata)
locateImportTables();
// Then create an OutputSection for each section.
// '$' and all following characters in input section names are
// discarded when determining output section. So, .text$foo
// contributes to .text, for example. See PE/COFF spec 3.2.
for (auto it : partialSections) {
PartialSection *pSec = it.second;
StringRef name = getOutputSectionName(pSec->name);
uint32_t outChars = pSec->characteristics;
if (name == ".CRT") {
// In link.exe, there is a special case for the I386 target where .CRT
// sections are treated as if they have output characteristics DATA | R if
// their characteristics are DATA | R | W. This implements the same
// special case for all architectures.
outChars = data | r;
log("Processing section " + pSec->name + " -> " + name);
sortCRTSectionChunks(pSec->chunks);
}
OutputSection *sec = createSection(name, outChars);
for (Chunk *c : pSec->chunks)
sec->addChunk(c);
sec->addContributingPartialSection(pSec);
}
// Finally, move some output sections to the end.
auto sectionOrder = [&](const OutputSection *s) {
// Move DISCARDABLE (or non-memory-mapped) sections to the end of file
// because the loader cannot handle holes. Stripping can remove discardable
// sections other than .reloc, which is the first of them (created early).
if (s->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE)
return 2;
// .rsrc should come at the end of the non-discardable sections because its
// size may be changed by the Win32 UpdateResources() function, causing
// subsequent sections to move (see https://crbug.com/827082).
if (s == rsrcSec)
return 1;
return 0;
};
llvm::stable_sort(outputSections,
[&](const OutputSection *s, const OutputSection *t) {
return sectionOrder(s) < sectionOrder(t);
});
}
void Writer::createMiscChunks() {
for (MergeChunk *p : MergeChunk::instances) {
if (p) {
p->finalizeContents();
rdataSec->addChunk(p);
}
}
// Create thunks for locally-dllimported symbols.
if (!symtab->localImportChunks.empty()) {
for (Chunk *c : symtab->localImportChunks)
rdataSec->addChunk(c);
}
// Create Debug Information Chunks
OutputSection *debugInfoSec = config->mingw ? buildidSec : rdataSec;
if (config->debug || config->repro) {
debugDirectory = make<DebugDirectoryChunk>(debugRecords, config->repro);
debugInfoSec->addChunk(debugDirectory);
}
if (config->debug) {
// Make a CVDebugRecordChunk even when /DEBUG:CV is not specified. We
// output a PDB no matter what, and this chunk provides the only means of
// allowing a debugger to match a PDB and an executable. So we need it even
// if we're ultimately not going to write CodeView data to the PDB.
buildId = make<CVDebugRecordChunk>();
debugRecords.push_back(buildId);
for (Chunk *c : debugRecords)
debugInfoSec->addChunk(c);
}
// Create SEH table. x86-only.
if (config->safeSEH)
createSEHTable();
// Create /guard:cf tables if requested.
if (config->guardCF != GuardCFLevel::Off)
createGuardCFTables();
if (config->mingw) {
createRuntimePseudoRelocs();
insertCtorDtorSymbols();
}
}
// Create .idata section for the DLL-imported symbol table.
// The format of this section is inherently Windows-specific.
// The IdataContents class abstracts away the details for us,
// so we just let it create chunks and add them to the section.
void Writer::createImportTables() {
// Initialize DLLOrder so that import entries are ordered in
// the same order as in the command line. (That affects DLL
// initialization order, and this ordering is MSVC-compatible.)
for (ImportFile *file : ImportFile::instances) {
if (!file->live)
continue;
std::string dll = StringRef(file->dllName).lower();
if (config->dllOrder.count(dll) == 0)
config->dllOrder[dll] = config->dllOrder.size();
if (file->impSym && !isa<DefinedImportData>(file->impSym))
fatal(toString(*file->impSym) + " was replaced");
DefinedImportData *impSym = cast_or_null<DefinedImportData>(file->impSym);
if (config->delayLoads.count(StringRef(file->dllName).lower())) {
if (!file->thunkSym)
fatal("cannot delay-load " + toString(file) +
" due to import of data: " + toString(*impSym));
delayIdata.add(impSym);
} else {
idata.add(impSym);
}
}
}
void Writer::appendImportThunks() {
if (ImportFile::instances.empty())
return;
for (ImportFile *file : ImportFile::instances) {
if (!file->live)
continue;
if (!file->thunkSym)
continue;
if (!isa<DefinedImportThunk>(file->thunkSym))
fatal(toString(*file->thunkSym) + " was replaced");
DefinedImportThunk *thunk = cast<DefinedImportThunk>(file->thunkSym);
if (file->thunkLive)
textSec->addChunk(thunk->getChunk());
}
if (!delayIdata.empty()) {
Defined *helper = cast<Defined>(config->delayLoadHelper);
delayIdata.create(helper);
for (Chunk *c : delayIdata.getChunks())
didatSec->addChunk(c);
for (Chunk *c : delayIdata.getDataChunks())
dataSec->addChunk(c);
for (Chunk *c : delayIdata.getCodeChunks())
textSec->addChunk(c);
}
}
void Writer::createExportTable() {
if (config->exports.empty())
return;
for (Chunk *c : edata.chunks)
edataSec->addChunk(c);
}
void Writer::removeUnusedSections() {
// Remove sections that we can be sure won't get content, to avoid
// allocating space for their section headers.
auto isUnused = [this](OutputSection *s) {
if (s == relocSec)
return false; // This section is populated later.
// MergeChunks have zero size at this point, as their size is finalized
// later. Only remove sections that have no Chunks at all.
return s->chunks.empty();
};
outputSections.erase(
std::remove_if(outputSections.begin(), outputSections.end(), isUnused),
outputSections.end());
}
// The Windows loader doesn't seem to like empty sections,
// so we remove them if any.
void Writer::removeEmptySections() {
auto isEmpty = [](OutputSection *s) { return s->getVirtualSize() == 0; };
outputSections.erase(
std::remove_if(outputSections.begin(), outputSections.end(), isEmpty),
outputSections.end());
}
void Writer::assignOutputSectionIndices() {
// Assign final output section indices, and assign each chunk to its output
// section.
uint32_t idx = 1;
for (OutputSection *os : outputSections) {
os->sectionIndex = idx;
for (Chunk *c : os->chunks)
c->setOutputSectionIdx(idx);
++idx;
}
// Merge chunks are containers of chunks, so assign those an output section
// too.
for (MergeChunk *mc : MergeChunk::instances)
if (mc)
for (SectionChunk *sc : mc->sections)
if (sc && sc->live)
sc->setOutputSectionIdx(mc->getOutputSectionIdx());
}
size_t Writer::addEntryToStringTable(StringRef str) {
assert(str.size() > COFF::NameSize);
size_t offsetOfEntry = strtab.size() + 4; // +4 for the size field
strtab.insert(strtab.end(), str.begin(), str.end());
strtab.push_back('\0');
return offsetOfEntry;
}
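// Illustrative sketch (editor's addition, not part of the original diff): how
// a COFF-style string table offset is computed. The table on disk is preceded
// by a 4-byte length field, so the first entry starts at offset 4. The section
// names and sizes below are invented for the example.
#include <cstdio>
#include <string>
#include <vector>

static size_t addEntry(std::vector<char> &strtab, const std::string &s) {
  size_t off = strtab.size() + 4; // +4 for the leading length field
  strtab.insert(strtab.end(), s.begin(), s.end());
  strtab.push_back('\0');
  return off;
}

int main() {
  std::vector<char> strtab;
  printf("%zu\n", addEntry(strtab, ".debug_info"));   // 4
  printf("%zu\n", addEntry(strtab, ".debug_abbrev")); // 4 + 12 = 16
  // The table written to the output is a uint32 holding strtab.size() + 4,
  // followed by the raw bytes of strtab.
}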
Optional<coff_symbol16> Writer::createSymbol(Defined *def) {
coff_symbol16 sym;
switch (def->kind()) {
case Symbol::DefinedAbsoluteKind:
sym.Value = def->getRVA();
sym.SectionNumber = IMAGE_SYM_ABSOLUTE;
break;
case Symbol::DefinedSyntheticKind:
// Relative symbols are unrepresentable in a COFF symbol table.
return None;
default: {
// Don't write symbols that won't be written to the output to the symbol
// table.
Chunk *c = def->getChunk();
if (!c)
return None;
OutputSection *os = c->getOutputSection();
if (!os)
return None;
sym.Value = def->getRVA() - os->getRVA();
sym.SectionNumber = os->sectionIndex;
break;
}
}
// Symbols that are runtime pseudo relocations don't point to the actual
// symbol data itself (as they are imported), but point to the IAT entry
// instead. Avoid emitting them to the symbol table, as they can confuse
// debuggers.
if (def->isRuntimePseudoReloc)
return None;
StringRef name = def->getName();
if (name.size() > COFF::NameSize) {
sym.Name.Offset.Zeroes = 0;
sym.Name.Offset.Offset = addEntryToStringTable(name);
} else {
memset(sym.Name.ShortName, 0, COFF::NameSize);
memcpy(sym.Name.ShortName, name.data(), name.size());
}
if (auto *d = dyn_cast<DefinedCOFF>(def)) {
COFFSymbolRef ref = d->getCOFFSymbol();
sym.Type = ref.getType();
sym.StorageClass = ref.getStorageClass();
} else {
sym.Type = IMAGE_SYM_TYPE_NULL;
sym.StorageClass = IMAGE_SYM_CLASS_EXTERNAL;
}
sym.NumberOfAuxSymbols = 0;
return sym;
}
void Writer::createSymbolAndStringTable() {
// PE/COFF images are limited to 8 byte section names. Longer names can be
// supported by writing a non-standard string table, but this string table is
// not mapped at runtime and the long names will therefore be inaccessible.
// link.exe always truncates section names to 8 bytes, whereas binutils always
// preserves long section names via the string table. LLD adopts a hybrid
// solution where discardable sections have long names preserved and
// non-discardable sections have their names truncated, to ensure that any
// section which is mapped at runtime also has its name mapped at runtime.
for (OutputSection *sec : outputSections) {
if (sec->name.size() <= COFF::NameSize)
continue;
if ((sec->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE) == 0)
continue;
sec->setStringTableOff(addEntryToStringTable(sec->name));
}
if (config->debugDwarf || config->debugSymtab) {
for (ObjFile *file : ObjFile::instances) {
for (Symbol *b : file->getSymbols()) {
auto *d = dyn_cast_or_null<Defined>(b);
if (!d || d->writtenToSymtab)
continue;
d->writtenToSymtab = true;
if (Optional<coff_symbol16> sym = createSymbol(d))
outputSymtab.push_back(*sym);
}
}
}
if (outputSymtab.empty() && strtab.empty())
return;
// We position the symbol table to be adjacent to the end of the last section.
uint64_t fileOff = fileSize;
pointerToSymbolTable = fileOff;
fileOff += outputSymtab.size() * sizeof(coff_symbol16);
fileOff += 4 + strtab.size();
fileSize = alignTo(fileOff, config->fileAlign);
}
void Writer::mergeSections() {
if (!pdataSec->chunks.empty()) {
firstPdata = pdataSec->chunks.front();
lastPdata = pdataSec->chunks.back();
}
for (auto &p : config->merge) {
StringRef toName = p.second;
if (p.first == toName)
continue;
StringSet<> names;
while (1) {
if (!names.insert(toName).second)
fatal("/merge: cycle found for section '" + p.first + "'");
auto i = config->merge.find(toName);
if (i == config->merge.end())
break;
toName = i->second;
}
OutputSection *from = findSection(p.first);
OutputSection *to = findSection(toName);
if (!from)
continue;
if (!to) {
from->name = toName;
continue;
}
to->merge(from);
}
}
// Visits all sections to assign incremental, non-overlapping RVAs and
// file offsets.
void Writer::assignAddresses() {
sizeOfHeaders = dosStubSize + sizeof(PEMagic) + sizeof(coff_file_header) +
sizeof(data_directory) * numberOfDataDirectory +
sizeof(coff_section) * outputSections.size();
sizeOfHeaders +=
config->is64() ? sizeof(pe32plus_header) : sizeof(pe32_header);
sizeOfHeaders = alignTo(sizeOfHeaders, config->fileAlign);
- uint64_t rva = pageSize; // The first page is kept unmapped.
fileSize = sizeOfHeaders;
+ // The first page is kept unmapped.
+ uint64_t rva = alignTo(sizeOfHeaders, config->align);
+
for (OutputSection *sec : outputSections) {
if (sec == relocSec)
addBaserels();
uint64_t rawSize = 0, virtualSize = 0;
sec->header.VirtualAddress = rva;
// If /FUNCTIONPADMIN is used, functions are padded in order to create a
// hotpatchable image.
const bool isCodeSection =
(sec->header.Characteristics & IMAGE_SCN_CNT_CODE) &&
(sec->header.Characteristics & IMAGE_SCN_MEM_READ) &&
(sec->header.Characteristics & IMAGE_SCN_MEM_EXECUTE);
uint32_t padding = isCodeSection ? config->functionPadMin : 0;
for (Chunk *c : sec->chunks) {
if (padding && c->isHotPatchable())
virtualSize += padding;
virtualSize = alignTo(virtualSize, c->getAlignment());
c->setRVA(rva + virtualSize);
virtualSize += c->getSize();
if (c->hasData)
rawSize = alignTo(virtualSize, config->fileAlign);
}
if (virtualSize > UINT32_MAX)
error("section larger than 4 GiB: " + sec->name);
sec->header.VirtualSize = virtualSize;
sec->header.SizeOfRawData = rawSize;
if (rawSize != 0)
sec->header.PointerToRawData = fileSize;
- rva += alignTo(virtualSize, pageSize);
+ rva += alignTo(virtualSize, config->align);
fileSize += alignTo(rawSize, config->fileAlign);
}
- sizeOfImage = alignTo(rva, pageSize);
+ sizeOfImage = alignTo(rva, config->align);
// Assign addresses to sections in MergeChunks.
for (MergeChunk *mc : MergeChunk::instances)
if (mc)
mc->assignSubsectionRVAs();
}
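// Illustrative sketch (editor's addition, not part of the original diff):
// section RVAs advance in multiples of SectionAlignment while file offsets
// advance in multiples of FileAlignment, mirroring the loop above. The
// alignments and section sizes are assumptions, not values from a real link.
#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t v, uint64_t a) { return (v + a - 1) / a * a; }

int main() {
  const uint64_t sectionAlign = 0x1000; // stands in for config->align
  const uint64_t fileAlign = 0x200;     // stands in for config->fileAlign
  uint64_t sizeOfHeaders = alignTo(0x400, fileAlign);
  uint64_t rva = alignTo(sizeOfHeaders, sectionAlign); // first section RVA
  uint64_t fileOff = sizeOfHeaders;
  uint64_t virtualSizes[] = {0x1234, 0x10}; // two made-up sections
  for (uint64_t vs : virtualSizes) {
    printf("RVA 0x%llx fileOff 0x%llx\n", (unsigned long long)rva,
           (unsigned long long)fileOff);
    rva += alignTo(vs, sectionAlign);
    fileOff += alignTo(vs, fileAlign);
  }
  printf("SizeOfImage 0x%llx\n",
         (unsigned long long)alignTo(rva, sectionAlign));
}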
template <typename PEHeaderTy> void Writer::writeHeader() {
// Write DOS header. For backwards compatibility, the first part of a PE/COFF
// executable consists of an MS-DOS MZ executable. If the executable is run
// under DOS, that program gets run (usually to just print an error message).
// When run under Windows, the loader looks at AddressOfNewExeHeader and uses
// the PE header instead.
uint8_t *buf = buffer->getBufferStart();
auto *dos = reinterpret_cast<dos_header *>(buf);
buf += sizeof(dos_header);
dos->Magic[0] = 'M';
dos->Magic[1] = 'Z';
dos->UsedBytesInTheLastPage = dosStubSize % 512;
dos->FileSizeInPages = divideCeil(dosStubSize, 512);
dos->HeaderSizeInParagraphs = sizeof(dos_header) / 16;
dos->AddressOfRelocationTable = sizeof(dos_header);
dos->AddressOfNewExeHeader = dosStubSize;
// Write DOS program.
memcpy(buf, dosProgram, sizeof(dosProgram));
buf += sizeof(dosProgram);
// Write PE magic
memcpy(buf, PEMagic, sizeof(PEMagic));
buf += sizeof(PEMagic);
// Write COFF header
auto *coff = reinterpret_cast<coff_file_header *>(buf);
buf += sizeof(*coff);
coff->Machine = config->machine;
coff->NumberOfSections = outputSections.size();
coff->Characteristics = IMAGE_FILE_EXECUTABLE_IMAGE;
if (config->largeAddressAware)
coff->Characteristics |= IMAGE_FILE_LARGE_ADDRESS_AWARE;
if (!config->is64())
coff->Characteristics |= IMAGE_FILE_32BIT_MACHINE;
if (config->dll)
coff->Characteristics |= IMAGE_FILE_DLL;
if (!config->relocatable)
coff->Characteristics |= IMAGE_FILE_RELOCS_STRIPPED;
if (config->swaprunCD)
coff->Characteristics |= IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP;
if (config->swaprunNet)
coff->Characteristics |= IMAGE_FILE_NET_RUN_FROM_SWAP;
coff->SizeOfOptionalHeader =
sizeof(PEHeaderTy) + sizeof(data_directory) * numberOfDataDirectory;
// Write PE header
auto *pe = reinterpret_cast<PEHeaderTy *>(buf);
buf += sizeof(*pe);
pe->Magic = config->is64() ? PE32Header::PE32_PLUS : PE32Header::PE32;
// If {Major,Minor}LinkerVersion is left at 0.0, then for some
// reason signing the resulting PE file with Authenticode produces a
// signature that fails to validate on Windows 7 (but is OK on 10).
// Set it to 14.0, which is what VS2015 outputs, and which avoids
// that problem.
pe->MajorLinkerVersion = 14;
pe->MinorLinkerVersion = 0;
pe->ImageBase = config->imageBase;
- pe->SectionAlignment = pageSize;
+ pe->SectionAlignment = config->align;
pe->FileAlignment = config->fileAlign;
pe->MajorImageVersion = config->majorImageVersion;
pe->MinorImageVersion = config->minorImageVersion;
pe->MajorOperatingSystemVersion = config->majorOSVersion;
pe->MinorOperatingSystemVersion = config->minorOSVersion;
pe->MajorSubsystemVersion = config->majorOSVersion;
pe->MinorSubsystemVersion = config->minorOSVersion;
pe->Subsystem = config->subsystem;
pe->SizeOfImage = sizeOfImage;
pe->SizeOfHeaders = sizeOfHeaders;
if (!config->noEntry) {
Defined *entry = cast<Defined>(config->entry);
pe->AddressOfEntryPoint = entry->getRVA();
// Pointer to thumb code must have the LSB set, so adjust it.
if (config->machine == ARMNT)
pe->AddressOfEntryPoint |= 1;
}
pe->SizeOfStackReserve = config->stackReserve;
pe->SizeOfStackCommit = config->stackCommit;
pe->SizeOfHeapReserve = config->heapReserve;
pe->SizeOfHeapCommit = config->heapCommit;
if (config->appContainer)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_APPCONTAINER;
if (config->dynamicBase)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE;
if (config->highEntropyVA)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA;
if (!config->allowBind)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_BIND;
if (config->nxCompat)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NX_COMPAT;
if (!config->allowIsolation)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_ISOLATION;
if (config->guardCF != GuardCFLevel::Off)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_GUARD_CF;
if (config->integrityCheck)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY;
if (setNoSEHCharacteristic)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_SEH;
if (config->terminalServerAware)
pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE;
pe->NumberOfRvaAndSize = numberOfDataDirectory;
if (textSec->getVirtualSize()) {
pe->BaseOfCode = textSec->getRVA();
pe->SizeOfCode = textSec->getRawSize();
}
pe->SizeOfInitializedData = getSizeOfInitializedData();
// Write data directory
auto *dir = reinterpret_cast<data_directory *>(buf);
buf += sizeof(*dir) * numberOfDataDirectory;
if (!config->exports.empty()) {
dir[EXPORT_TABLE].RelativeVirtualAddress = edata.getRVA();
dir[EXPORT_TABLE].Size = edata.getSize();
}
if (importTableStart) {
dir[IMPORT_TABLE].RelativeVirtualAddress = importTableStart->getRVA();
dir[IMPORT_TABLE].Size = importTableSize;
}
if (iatStart) {
dir[IAT].RelativeVirtualAddress = iatStart->getRVA();
dir[IAT].Size = iatSize;
}
if (rsrcSec->getVirtualSize()) {
dir[RESOURCE_TABLE].RelativeVirtualAddress = rsrcSec->getRVA();
dir[RESOURCE_TABLE].Size = rsrcSec->getVirtualSize();
}
if (firstPdata) {
dir[EXCEPTION_TABLE].RelativeVirtualAddress = firstPdata->getRVA();
dir[EXCEPTION_TABLE].Size =
lastPdata->getRVA() + lastPdata->getSize() - firstPdata->getRVA();
}
if (relocSec->getVirtualSize()) {
dir[BASE_RELOCATION_TABLE].RelativeVirtualAddress = relocSec->getRVA();
dir[BASE_RELOCATION_TABLE].Size = relocSec->getVirtualSize();
}
if (Symbol *sym = symtab->findUnderscore("_tls_used")) {
if (Defined *b = dyn_cast<Defined>(sym)) {
dir[TLS_TABLE].RelativeVirtualAddress = b->getRVA();
dir[TLS_TABLE].Size = config->is64()
? sizeof(object::coff_tls_directory64)
: sizeof(object::coff_tls_directory32);
}
}
if (debugDirectory) {
dir[DEBUG_DIRECTORY].RelativeVirtualAddress = debugDirectory->getRVA();
dir[DEBUG_DIRECTORY].Size = debugDirectory->getSize();
}
if (Symbol *sym = symtab->findUnderscore("_load_config_used")) {
if (auto *b = dyn_cast<DefinedRegular>(sym)) {
SectionChunk *sc = b->getChunk();
assert(b->getRVA() >= sc->getRVA());
uint64_t offsetInChunk = b->getRVA() - sc->getRVA();
if (!sc->hasData || offsetInChunk + 4 > sc->getSize())
fatal("_load_config_used is malformed");
ArrayRef<uint8_t> secContents = sc->getContents();
uint32_t loadConfigSize =
*reinterpret_cast<const ulittle32_t *>(&secContents[offsetInChunk]);
if (offsetInChunk + loadConfigSize > sc->getSize())
fatal("_load_config_used is too large");
dir[LOAD_CONFIG_TABLE].RelativeVirtualAddress = b->getRVA();
dir[LOAD_CONFIG_TABLE].Size = loadConfigSize;
}
}
if (!delayIdata.empty()) {
dir[DELAY_IMPORT_DESCRIPTOR].RelativeVirtualAddress =
delayIdata.getDirRVA();
dir[DELAY_IMPORT_DESCRIPTOR].Size = delayIdata.getDirSize();
}
// Write section table
for (OutputSection *sec : outputSections) {
sec->writeHeaderTo(buf);
buf += sizeof(coff_section);
}
sectionTable = ArrayRef<uint8_t>(
buf - outputSections.size() * sizeof(coff_section), buf);
if (outputSymtab.empty() && strtab.empty())
return;
coff->PointerToSymbolTable = pointerToSymbolTable;
uint32_t numberOfSymbols = outputSymtab.size();
coff->NumberOfSymbols = numberOfSymbols;
auto *symbolTable = reinterpret_cast<coff_symbol16 *>(
buffer->getBufferStart() + coff->PointerToSymbolTable);
for (size_t i = 0; i != numberOfSymbols; ++i)
symbolTable[i] = outputSymtab[i];
// Create the string table, it follows immediately after the symbol table.
// The first 4 bytes is length including itself.
buf = reinterpret_cast<uint8_t *>(&symbolTable[numberOfSymbols]);
write32le(buf, strtab.size() + 4);
if (!strtab.empty())
memcpy(buf + 4, strtab.data(), strtab.size());
}
void Writer::openFile(StringRef path) {
buffer = CHECK(
FileOutputBuffer::create(path, fileSize, FileOutputBuffer::F_executable),
"failed to open " + path);
}
void Writer::createSEHTable() {
SymbolRVASet handlers;
for (ObjFile *file : ObjFile::instances) {
if (!file->hasSafeSEH())
error("/safeseh: " + file->getName() + " is not compatible with SEH");
markSymbolsForRVATable(file, file->getSXDataChunks(), handlers);
}
// Set the "no SEH" characteristic if there really were no handlers, or if
// there is no load config object to point to the table of handlers.
setNoSEHCharacteristic =
handlers.empty() || !symtab->findUnderscore("_load_config_used");
maybeAddRVATable(std::move(handlers), "__safe_se_handler_table",
"__safe_se_handler_count");
}
// Add a symbol to an RVA set. Two symbols may have the same RVA, but an RVA set
// cannot contain duplicates. Therefore, the set is uniqued by Chunk and the
// symbol's offset into that Chunk.
static void addSymbolToRVASet(SymbolRVASet &rvaSet, Defined *s) {
Chunk *c = s->getChunk();
if (auto *sc = dyn_cast<SectionChunk>(c))
c = sc->repl; // Look through ICF replacement.
uint32_t off = s->getRVA() - (c ? c->getRVA() : 0);
rvaSet.insert({c, off});
}
// Given a symbol, add it to the GFIDs table if it is a live, defined, function
// symbol in an executable section.
static void maybeAddAddressTakenFunction(SymbolRVASet &addressTakenSyms,
Symbol *s) {
if (!s)
return;
switch (s->kind()) {
case Symbol::DefinedLocalImportKind:
case Symbol::DefinedImportDataKind:
// Defines an __imp_ pointer, so it is data, so it is ignored.
break;
case Symbol::DefinedCommonKind:
// Common is always data, so it is ignored.
break;
case Symbol::DefinedAbsoluteKind:
case Symbol::DefinedSyntheticKind:
// Absolute is never code, synthetic generally isn't and usually isn't
// determinable.
break;
case Symbol::LazyKind:
case Symbol::UndefinedKind:
// Undefined symbols resolve to zero, so they don't have an RVA. Lazy
// symbols shouldn't have relocations.
break;
case Symbol::DefinedImportThunkKind:
// Thunks are always code, include them.
addSymbolToRVASet(addressTakenSyms, cast<Defined>(s));
break;
case Symbol::DefinedRegularKind: {
// This is a regular, defined, symbol from a COFF file. Mark the symbol as
// address taken if the symbol type is function and it's in an executable
// section.
auto *d = cast<DefinedRegular>(s);
if (d->getCOFFSymbol().getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
SectionChunk *sc = dyn_cast<SectionChunk>(d->getChunk());
if (sc && sc->live &&
sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE)
addSymbolToRVASet(addressTakenSyms, d);
}
break;
}
}
}
// Visit all relocations from all section contributions of this object file and
// mark the relocation target as address-taken.
static void markSymbolsWithRelocations(ObjFile *file,
SymbolRVASet &usedSymbols) {
for (Chunk *c : file->getChunks()) {
// We only care about live section chunks. Common chunks and other chunks
// don't generally contain relocations.
SectionChunk *sc = dyn_cast<SectionChunk>(c);
if (!sc || !sc->live)
continue;
for (const coff_relocation &reloc : sc->getRelocs()) {
if (config->machine == I386 && reloc.Type == COFF::IMAGE_REL_I386_REL32)
// Ignore relative relocations on x86. On x86_64 they can't be ignored
// since they're also used to compute absolute addresses.
continue;
Symbol *ref = sc->file->getSymbol(reloc.SymbolTableIndex);
maybeAddAddressTakenFunction(usedSymbols, ref);
}
}
}
// Create the guard function id table. This is a table of RVAs of all
// address-taken functions. It is sorted and uniqued, just like the safe SEH
// table.
void Writer::createGuardCFTables() {
SymbolRVASet addressTakenSyms;
SymbolRVASet longJmpTargets;
for (ObjFile *file : ObjFile::instances) {
// If the object was compiled with /guard:cf, the address taken symbols
// are in .gfids$y sections, and the longjmp targets are in .gljmp$y
// sections. If the object was not compiled with /guard:cf, we assume there
// were no setjmp targets, and that all code symbols with relocations are
// possibly address-taken.
if (file->hasGuardCF()) {
markSymbolsForRVATable(file, file->getGuardFidChunks(), addressTakenSyms);
markSymbolsForRVATable(file, file->getGuardLJmpChunks(), longJmpTargets);
} else {
markSymbolsWithRelocations(file, addressTakenSyms);
}
}
// Mark the image entry as address-taken.
if (config->entry)
maybeAddAddressTakenFunction(addressTakenSyms, config->entry);
// Mark exported symbols in executable sections as address-taken.
for (Export &e : config->exports)
maybeAddAddressTakenFunction(addressTakenSyms, e.sym);
// Ensure sections referenced in the gfid table are 16-byte aligned.
for (const ChunkAndOffset &c : addressTakenSyms)
if (c.inputChunk->getAlignment() < 16)
c.inputChunk->setAlignment(16);
maybeAddRVATable(std::move(addressTakenSyms), "__guard_fids_table",
"__guard_fids_count");
// Add the longjmp target table unless the user told us not to.
if (config->guardCF == GuardCFLevel::Full)
maybeAddRVATable(std::move(longJmpTargets), "__guard_longjmp_table",
"__guard_longjmp_count");
// Set __guard_flags, which will be used in the load config to indicate that
// /guard:cf was enabled.
uint32_t guardFlags = uint32_t(coff_guard_flags::CFInstrumented) |
uint32_t(coff_guard_flags::HasFidTable);
if (config->guardCF == GuardCFLevel::Full)
guardFlags |= uint32_t(coff_guard_flags::HasLongJmpTable);
Symbol *flagSym = symtab->findUnderscore("__guard_flags");
cast<DefinedAbsolute>(flagSym)->setVA(guardFlags);
}
// Take a list of input sections containing symbol table indices and add those
// symbols to an RVA table. The challenge is that symbol RVAs are not known and
// depend on the table size, so we can't directly build a set of integers.
void Writer::markSymbolsForRVATable(ObjFile *file,
ArrayRef<SectionChunk *> symIdxChunks,
SymbolRVASet &tableSymbols) {
for (SectionChunk *c : symIdxChunks) {
// Skip sections discarded by linker GC. This comes up when a .gfids section
// is associated with something like a vtable and the vtable is discarded.
// In this case, the associated gfids section is discarded, and we don't
// mark the virtual member functions as address-taken by the vtable.
if (!c->live)
continue;
// Validate that the contents look like symbol table indices.
ArrayRef<uint8_t> data = c->getContents();
if (data.size() % 4 != 0) {
warn("ignoring " + c->getSectionName() +
" symbol table index section in object " + toString(file));
continue;
}
// Read each symbol table index and check if that symbol was included in the
// final link. If so, add it to the table symbol set.
ArrayRef<ulittle32_t> symIndices(
reinterpret_cast<const ulittle32_t *>(data.data()), data.size() / 4);
ArrayRef<Symbol *> objSymbols = file->getSymbols();
for (uint32_t symIndex : symIndices) {
if (symIndex >= objSymbols.size()) {
warn("ignoring invalid symbol table index in section " +
c->getSectionName() + " in object " + toString(file));
continue;
}
if (Symbol *s = objSymbols[symIndex]) {
if (s->isLive())
addSymbolToRVASet(tableSymbols, cast<Defined>(s));
}
}
}
}
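// Illustrative sketch (editor's addition, not part of the original diff):
// interpreting a .gfids$y / .sxdata-style payload as an array of little-endian
// 32-bit symbol table indices, as the loop above does. The byte contents are
// invented.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint8_t> data = {0x05, 0x00, 0x00, 0x00,  // index 5
                               0x2a, 0x00, 0x00, 0x00}; // index 42
  if (data.size() % 4 != 0)
    return 1; // malformed section; the linker warns and skips it
  for (size_t i = 0; i < data.size(); i += 4) {
    uint32_t idx = data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) |
                   (uint32_t(data[i + 3]) << 24);
    printf("symbol table index %u\n", idx);
  }
}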
// Replace the absolute table symbol with a synthetic symbol pointing to
// tableChunk so that we can emit base relocations for it and resolve section
// relative relocations.
void Writer::maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym,
StringRef countSym) {
if (tableSymbols.empty())
return;
RVATableChunk *tableChunk = make<RVATableChunk>(std::move(tableSymbols));
rdataSec->addChunk(tableChunk);
Symbol *t = symtab->findUnderscore(tableSym);
Symbol *c = symtab->findUnderscore(countSym);
replaceSymbol<DefinedSynthetic>(t, t->getName(), tableChunk);
cast<DefinedAbsolute>(c)->setVA(tableChunk->getSize() / 4);
}
// MinGW specific. Gather all relocations that are imported from a DLL even
// though the code didn't expect it to, produce the table that the runtime
// uses for fixing them up, and provide the synthetic symbols that the
// runtime uses for finding the table.
void Writer::createRuntimePseudoRelocs() {
std::vector<RuntimePseudoReloc> rels;
for (Chunk *c : symtab->getChunks()) {
auto *sc = dyn_cast<SectionChunk>(c);
if (!sc || !sc->live)
continue;
sc->getRuntimePseudoRelocs(rels);
}
if (!rels.empty())
log("Writing " + Twine(rels.size()) + " runtime pseudo relocations");
PseudoRelocTableChunk *table = make<PseudoRelocTableChunk>(rels);
rdataSec->addChunk(table);
EmptyChunk *endOfList = make<EmptyChunk>();
rdataSec->addChunk(endOfList);
Symbol *headSym = symtab->findUnderscore("__RUNTIME_PSEUDO_RELOC_LIST__");
Symbol *endSym = symtab->findUnderscore("__RUNTIME_PSEUDO_RELOC_LIST_END__");
replaceSymbol<DefinedSynthetic>(headSym, headSym->getName(), table);
replaceSymbol<DefinedSynthetic>(endSym, endSym->getName(), endOfList);
}
// MinGW specific.
// The MinGW .ctors and .dtors lists have sentinels at each end;
// a (uintptr_t)-1 at the start and a (uintptr_t)0 at the end.
// There's a symbol pointing to the start sentinel pointer, __CTOR_LIST__
// and __DTOR_LIST__ respectively.
void Writer::insertCtorDtorSymbols() {
AbsolutePointerChunk *ctorListHead = make<AbsolutePointerChunk>(-1);
AbsolutePointerChunk *ctorListEnd = make<AbsolutePointerChunk>(0);
AbsolutePointerChunk *dtorListHead = make<AbsolutePointerChunk>(-1);
AbsolutePointerChunk *dtorListEnd = make<AbsolutePointerChunk>(0);
ctorsSec->insertChunkAtStart(ctorListHead);
ctorsSec->addChunk(ctorListEnd);
dtorsSec->insertChunkAtStart(dtorListHead);
dtorsSec->addChunk(dtorListEnd);
Symbol *ctorListSym = symtab->findUnderscore("__CTOR_LIST__");
Symbol *dtorListSym = symtab->findUnderscore("__DTOR_LIST__");
replaceSymbol<DefinedSynthetic>(ctorListSym, ctorListSym->getName(),
ctorListHead);
replaceSymbol<DefinedSynthetic>(dtorListSym, dtorListSym->getName(),
dtorListHead);
}
// Handles /section options to allow users to overwrite
// section attributes.
void Writer::setSectionPermissions() {
for (auto &p : config->section) {
StringRef name = p.first;
uint32_t perm = p.second;
for (OutputSection *sec : outputSections)
if (sec->name == name)
sec->setPermissions(perm);
}
}
// Write section contents to a mmap'ed file.
void Writer::writeSections() {
// Record the number of sections to apply section index relocations
// against absolute symbols. See applySecIdx in Chunks.cpp.
DefinedAbsolute::numOutputSections = outputSections.size();
uint8_t *buf = buffer->getBufferStart();
for (OutputSection *sec : outputSections) {
uint8_t *secBuf = buf + sec->getFileOff();
// Fill gaps between functions in .text with INT3 instructions
// instead of leaving as NUL bytes (which can be interpreted as
// ADD instructions).
if (sec->header.Characteristics & IMAGE_SCN_CNT_CODE)
memset(secBuf, 0xCC, sec->getRawSize());
parallelForEach(sec->chunks, [&](Chunk *c) {
c->writeTo(secBuf + c->getRVA() - sec->getRVA());
});
}
}
void Writer::writeBuildId() {
// There are two important parts to the build ID.
// 1) If building with debug info, the COFF debug directory contains a
// timestamp as well as a Guid and Age of the PDB.
// 2) In all cases, the PE COFF file header also contains a timestamp.
// For reproducibility, instead of a timestamp we want to use a hash of the
// PE contents.
if (config->debug) {
assert(buildId && "BuildId is not set!");
// BuildId->BuildId was filled in when the PDB was written.
}
// At this point the only fields in the COFF file which remain unset are the
// "timestamp" in the COFF file header, and the ones in the coff debug
// directory. Now we can hash the file and write that hash to the various
// timestamp fields in the file.
StringRef outputFileData(
reinterpret_cast<const char *>(buffer->getBufferStart()),
buffer->getBufferSize());
uint32_t timestamp = config->timestamp;
uint64_t hash = 0;
bool generateSyntheticBuildId =
config->mingw && config->debug && config->pdbPath.empty();
if (config->repro || generateSyntheticBuildId)
hash = xxHash64(outputFileData);
if (config->repro)
timestamp = static_cast<uint32_t>(hash);
if (generateSyntheticBuildId) {
// For MinGW builds without a PDB file, we still generate a build id
// to allow associating a crash dump to the executable.
buildId->buildId->PDB70.CVSignature = OMF::Signature::PDB70;
buildId->buildId->PDB70.Age = 1;
memcpy(buildId->buildId->PDB70.Signature, &hash, 8);
// xxhash only gives us 8 bytes, so put some fixed data in the other half.
memcpy(&buildId->buildId->PDB70.Signature[8], "LLD PDB.", 8);
}
if (debugDirectory)
debugDirectory->setTimeDateStamp(timestamp);
uint8_t *buf = buffer->getBufferStart();
buf += dosStubSize + sizeof(PEMagic);
object::coff_file_header *coffHeader =
reinterpret_cast<coff_file_header *>(buf);
coffHeader->TimeDateStamp = timestamp;
}
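// Illustrative sketch (editor's addition, not part of the original diff): how
// a 64-bit content hash can be folded into the 32-bit TimeDateStamp and the
// first half of a PDB70 signature, as the code above does with xxHash64. The
// hash value here is a made-up constant, not a real xxHash64 result.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint64_t hash = 0x0123456789abcdefULL; // stand-in for xxHash64(image bytes)
  uint32_t timestamp = static_cast<uint32_t>(hash); // low 32 bits
  uint8_t signature[16] = {};
  memcpy(signature, &hash, 8);          // first 8 bytes come from the hash
  memcpy(signature + 8, "LLD PDB.", 8); // fixed filler for the other half
  printf("TimeDateStamp 0x%08x\n", timestamp);
}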
// Sort .pdata section contents according to PE/COFF spec 5.5.
void Writer::sortExceptionTable() {
if (!firstPdata)
return;
// We assume .pdata contains function table entries only.
auto bufAddr = [&](Chunk *c) {
OutputSection *os = c->getOutputSection();
return buffer->getBufferStart() + os->getFileOff() + c->getRVA() -
os->getRVA();
};
uint8_t *begin = bufAddr(firstPdata);
uint8_t *end = bufAddr(lastPdata) + lastPdata->getSize();
if (config->machine == AMD64) {
struct Entry { ulittle32_t begin, end, unwind; };
parallelSort(
MutableArrayRef<Entry>((Entry *)begin, (Entry *)end),
[](const Entry &a, const Entry &b) { return a.begin < b.begin; });
return;
}
if (config->machine == ARMNT || config->machine == ARM64) {
struct Entry { ulittle32_t begin, unwind; };
parallelSort(
MutableArrayRef<Entry>((Entry *)begin, (Entry *)end),
[](const Entry &a, const Entry &b) { return a.begin < b.begin; });
return;
}
errs() << "warning: don't know how to handle .pdata.\n";
}
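// Illustrative sketch (editor's addition, not part of the original diff):
// sorting fake AMD64 .pdata entries by BeginAddress, mirroring the 12-byte
// {begin, end, unwind} records handled above. The entry values are invented.
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct Entry { uint32_t begin, end, unwind; };

int main() {
  Entry pdata[] = {{0x3000, 0x3040, 0x5000},
                   {0x1000, 0x1020, 0x5010},
                   {0x2000, 0x2080, 0x5020}};
  std::sort(std::begin(pdata), std::end(pdata),
            [](const Entry &a, const Entry &b) { return a.begin < b.begin; });
  for (const Entry &e : pdata)
    printf("0x%x-0x%x unwind 0x%x\n", e.begin, e.end, e.unwind);
}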
// The CRT section contains, among other things, the array of function
// pointers that initialize every global variable that is not trivially
// constructed. The CRT calls them one after the other prior to invoking
// main().
//
// As per C++ spec, 3.6.2/2.3,
// "Variables with ordered initialization defined within a single
// translation unit shall be initialized in the order of their definitions
// in the translation unit"
//
// It is therefore critical to sort the chunks containing the function
// pointers in the order that they are listed in the object file (top to
// bottom), otherwise global objects might not be initialized in the
// correct order.
void Writer::sortCRTSectionChunks(std::vector<Chunk *> &chunks) {
auto sectionChunkOrder = [](const Chunk *a, const Chunk *b) {
auto sa = dyn_cast<SectionChunk>(a);
auto sb = dyn_cast<SectionChunk>(b);
assert(sa && sb && "Non-section chunks in CRT section!");
StringRef sAObj = sa->file->mb.getBufferIdentifier();
StringRef sBObj = sb->file->mb.getBufferIdentifier();
return sAObj == sBObj && sa->getSectionNumber() < sb->getSectionNumber();
};
llvm::stable_sort(chunks, sectionChunkOrder);
if (config->verbose) {
for (auto &c : chunks) {
auto sc = dyn_cast<SectionChunk>(c);
log(" " + sc->file->mb.getBufferIdentifier().str() +
", SectionID: " + Twine(sc->getSectionNumber()));
}
}
}
OutputSection *Writer::findSection(StringRef name) {
for (OutputSection *sec : outputSections)
if (sec->name == name)
return sec;
return nullptr;
}
uint32_t Writer::getSizeOfInitializedData() {
uint32_t res = 0;
for (OutputSection *s : outputSections)
if (s->header.Characteristics & IMAGE_SCN_CNT_INITIALIZED_DATA)
res += s->getRawSize();
return res;
}
// Add base relocations to .reloc section.
void Writer::addBaserels() {
if (!config->relocatable)
return;
relocSec->chunks.clear();
std::vector<Baserel> v;
for (OutputSection *sec : outputSections) {
if (sec->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE)
continue;
// Collect all locations for base relocations.
for (Chunk *c : sec->chunks)
c->getBaserels(&v);
// Add the addresses to .reloc section.
if (!v.empty())
addBaserelBlocks(v);
v.clear();
}
}
// Add addresses to .reloc section. Note that addresses are grouped by page.
void Writer::addBaserelBlocks(std::vector<Baserel> &v) {
const uint32_t mask = ~uint32_t(pageSize - 1);
uint32_t page = v[0].rva & mask;
size_t i = 0, j = 1;
for (size_t e = v.size(); j < e; ++j) {
uint32_t p = v[j].rva & mask;
if (p == page)
continue;
relocSec->addChunk(make<BaserelChunk>(page, &v[i], &v[0] + j));
i = j;
page = p;
}
if (i == j)
return;
relocSec->addChunk(make<BaserelChunk>(page, &v[i], &v[0] + j));
}
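// Illustrative sketch (editor's addition, not part of the original diff):
// grouping base relocation RVAs by page, the same splitting addBaserelBlocks()
// performs. The 4 KiB page size and the sorted RVAs below are assumptions for
// illustration only.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint32_t pageSize = 0x1000;
  const uint32_t mask = ~uint32_t(pageSize - 1);
  std::vector<uint32_t> rvas = {0x1004, 0x1010, 0x2008, 0x2100, 0x5000};
  size_t i = 0;
  uint32_t page = rvas[0] & mask;
  for (size_t j = 1; j <= rvas.size(); ++j) {
    // Past the end, force a flush of the final group.
    uint32_t p = j < rvas.size() ? (rvas[j] & mask) : ~0u;
    if (j < rvas.size() && p == page)
      continue;
    printf("block for page 0x%x with %zu entries\n", page, j - i);
    i = j;
    page = p;
  }
}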
PartialSection *Writer::createPartialSection(StringRef name,
uint32_t outChars) {
PartialSection *&pSec = partialSections[{name, outChars}];
if (pSec)
return pSec;
pSec = make<PartialSection>(name, outChars);
return pSec;
}
PartialSection *Writer::findPartialSection(StringRef name, uint32_t outChars) {
auto it = partialSections.find({name, outChars});
if (it != partialSections.end())
return it->second;
return nullptr;
}
Index: projects/clang900-import/contrib/llvm/tools/lld/ELF/Arch/PPC.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/ELF/Arch/PPC.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/ELF/Arch/PPC.cpp (revision 351722)
@@ -1,432 +1,441 @@
//===- PPC.cpp ------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "OutputSections.h"
#include "Symbols.h"
#include "SyntheticSections.h"
#include "Target.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/Support/Endian.h"
using namespace llvm;
using namespace llvm::support::endian;
using namespace llvm::ELF;
using namespace lld;
using namespace lld::elf;
namespace {
class PPC final : public TargetInfo {
public:
PPC();
RelExpr getRelExpr(RelType type, const Symbol &s,
const uint8_t *loc) const override;
RelType getDynRel(RelType type) const override;
void writeGotHeader(uint8_t *buf) const override;
void writePltHeader(uint8_t *buf) const override {
llvm_unreachable("should call writePPC32GlinkSection() instead");
}
void writePlt(uint8_t *buf, uint64_t gotPltEntryAddr, uint64_t pltEntryAddr,
int32_t index, unsigned relOff) const override {
llvm_unreachable("should call writePPC32GlinkSection() instead");
}
void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
bool needsThunk(RelExpr expr, RelType relocType, const InputFile *file,
uint64_t branchAddr, const Symbol &s) const override;
uint32_t getThunkSectionSpacing() const override;
bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override;
void relocateOne(uint8_t *loc, RelType type, uint64_t val) const override;
RelExpr adjustRelaxExpr(RelType type, const uint8_t *data,
RelExpr expr) const override;
int getTlsGdRelaxSkip(RelType type) const override;
void relaxTlsGdToIe(uint8_t *loc, RelType type, uint64_t val) const override;
void relaxTlsGdToLe(uint8_t *loc, RelType type, uint64_t val) const override;
void relaxTlsLdToLe(uint8_t *loc, RelType type, uint64_t val) const override;
void relaxTlsIeToLe(uint8_t *loc, RelType type, uint64_t val) const override;
};
} // namespace
static uint16_t lo(uint32_t v) { return v; }
static uint16_t ha(uint32_t v) { return (v + 0x8000) >> 16; }
static uint32_t readFromHalf16(const uint8_t *loc) {
return read32(config->isLE ? loc : loc - 2);
}
static void writeFromHalf16(uint8_t *loc, uint32_t insn) {
write32(config->isLE ? loc : loc - 2, insn);
}
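// Illustrative sketch (editor's addition, not part of the original diff): why
// ha() adds 0x8000 before shifting. The low half is consumed by instructions
// that sign-extend their 16-bit operand (addi/lwz D-forms), so the high half
// must round up whenever the low half is negative. The address is an arbitrary
// example value.
#include <cassert>
#include <cstdint>

static uint16_t exLo(uint32_t v) { return v; }
static uint16_t exHa(uint32_t v) { return (v + 0x8000) >> 16; }

int main() {
  uint32_t v = 0x12348765;
  int32_t signedLo = static_cast<int16_t>(exLo(v)); // -0x789B after sign-extension
  uint32_t rebuilt = (uint32_t(exHa(v)) << 16) + signedLo;
  assert(rebuilt == v); // ha()/lo() round-trip exactly
}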
void elf::writePPC32GlinkSection(uint8_t *buf, size_t numEntries) {
// On PPC Secure PLT ABI, bl foo@plt jumps to a call stub, which loads an
// absolute address from a specific .plt slot (usually called .got.plt on
// other targets) and jumps there.
//
// a) With immediate binding (BIND_NOW), the .plt entry is resolved at load
// time. The .glink section is not used.
// b) With lazy binding, the .plt entry points to a `b PLTresolve`
// instruction in .glink, filled in by PPC::writeGotPlt().
// Write N `b PLTresolve` first.
for (size_t i = 0; i != numEntries; ++i)
write32(buf + 4 * i, 0x48000000 | 4 * (numEntries - i));
buf += 4 * numEntries;
// Then write PLTresolve(), which has two forms: PIC and non-PIC. PLTresolve()
// computes the PLT index (by computing the distance from the landing b to
// itself) and calls _dl_runtime_resolve() (in glibc).
uint32_t got = in.got->getVA();
uint32_t glink = in.plt->getVA(); // VA of .glink
const uint8_t *end = buf + 64;
if (config->isPic) {
uint32_t afterBcl = in.plt->getSize() - target->pltHeaderSize + 12;
uint32_t gotBcl = got + 4 - (glink + afterBcl);
write32(buf + 0, 0x3d6b0000 | ha(afterBcl)); // addis r11,r11,1f-glink@ha
write32(buf + 4, 0x7c0802a6); // mflr r0
write32(buf + 8, 0x429f0005); // bcl 20,30,.+4
write32(buf + 12, 0x396b0000 | lo(afterBcl)); // 1: addi r11,r11,1b-.glink@l
write32(buf + 16, 0x7d8802a6); // mflr r12
write32(buf + 20, 0x7c0803a6); // mtlr r0
write32(buf + 24, 0x7d6c5850); // sub r11,r11,r12
write32(buf + 28, 0x3d8c0000 | ha(gotBcl)); // addis 12,12,GOT+4-1b@ha
if (ha(gotBcl) == ha(gotBcl + 4)) {
write32(buf + 32, 0x800c0000 | lo(gotBcl)); // lwz r0,r12,GOT+4-1b@l(r12)
write32(buf + 36,
0x818c0000 | lo(gotBcl + 4)); // lwz r12,r12,GOT+8-1b@l(r12)
} else {
write32(buf + 32, 0x840c0000 | lo(gotBcl)); // lwzu r0,r12,GOT+4-1b@l(r12)
write32(buf + 36, 0x818c0000 | 4); // lwz r12,r12,4(r12)
}
write32(buf + 40, 0x7c0903a6); // mtctr 0
write32(buf + 44, 0x7c0b5a14); // add r0,11,11
write32(buf + 48, 0x7d605a14); // add r11,0,11
write32(buf + 52, 0x4e800420); // bctr
buf += 56;
} else {
write32(buf + 0, 0x3d800000 | ha(got + 4)); // lis r12,GOT+4@ha
write32(buf + 4, 0x3d6b0000 | ha(-glink)); // addis r11,r11,-Glink@ha
if (ha(got + 4) == ha(got + 8))
write32(buf + 8, 0x800c0000 | lo(got + 4)); // lwz r0,GOT+4@l(r12)
else
write32(buf + 8, 0x840c0000 | lo(got + 4)); // lwzu r0,GOT+4@l(r12)
write32(buf + 12, 0x396b0000 | lo(-glink)); // addi r11,r11,-Glink@l
write32(buf + 16, 0x7c0903a6); // mtctr r0
write32(buf + 20, 0x7c0b5a14); // add r0,r11,r11
if (ha(got + 4) == ha(got + 8))
write32(buf + 24, 0x818c0000 | lo(got + 8)); // lwz r12,GOT+8@ha(r12)
else
write32(buf + 24, 0x818c0000 | 4); // lwz r12,4(r12)
write32(buf + 28, 0x7d605a14); // add r11,r0,r11
write32(buf + 32, 0x4e800420); // bctr
buf += 36;
}
// Pad with nops. They should not be executed.
for (; buf < end; buf += 4)
write32(buf, 0x60000000);
}
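// Illustrative sketch (editor's addition, not part of the original diff): how
// the N leading `b PLTresolve` stubs written above encode their branch
// offsets. 0x48000000 is the PPC unconditional-branch opcode; the low 26 bits
// carry the byte offset. numEntries below is an arbitrary example.
#include <cstdint>
#include <cstdio>

int main() {
  size_t numEntries = 3;
  for (size_t i = 0; i != numEntries; ++i) {
    uint32_t insn = 0x48000000 | 4 * (numEntries - i);
    // Entry i sits at byte 4*i; adding its offset lands at byte 4*numEntries,
    // i.e. the first byte after the stubs, where PLTresolve starts.
    printf("entry %zu: insn 0x%08x, offset %u -> byte %zu\n", i, insn,
           4 * unsigned(numEntries - i), 4 * i + 4 * (numEntries - i));
  }
}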
PPC::PPC() {
gotRel = R_PPC_GLOB_DAT;
noneRel = R_PPC_NONE;
pltRel = R_PPC_JMP_SLOT;
relativeRel = R_PPC_RELATIVE;
iRelativeRel = R_PPC_IRELATIVE;
symbolicRel = R_PPC_ADDR32;
gotBaseSymInGotPlt = false;
gotHeaderEntriesNum = 3;
gotPltHeaderEntriesNum = 0;
pltHeaderSize = 64; // size of PLTresolve in .glink
pltEntrySize = 4;
needsThunks = true;
tlsModuleIndexRel = R_PPC_DTPMOD32;
tlsOffsetRel = R_PPC_DTPREL32;
tlsGotRel = R_PPC_TPREL32;
defaultMaxPageSize = 65536;
defaultImageBase = 0x10000000;
write32(trapInstr.data(), 0x7fe00008);
}
void PPC::writeGotHeader(uint8_t *buf) const {
// _GLOBAL_OFFSET_TABLE_[0] = _DYNAMIC
// glibc stores _dl_runtime_resolve in _GLOBAL_OFFSET_TABLE_[1],
// link_map in _GLOBAL_OFFSET_TABLE_[2].
write32(buf, mainPart->dynamic->getVA());
}
void PPC::writeGotPlt(uint8_t *buf, const Symbol &s) const {
// Address of the symbol resolver stub in .glink .
write32(buf, in.plt->getVA() + 4 * s.pltIndex);
}
bool PPC::needsThunk(RelExpr expr, RelType type, const InputFile *file,
uint64_t branchAddr, const Symbol &s) const {
if (type != R_PPC_REL24 && type != R_PPC_PLTREL24)
return false;
if (s.isInPlt())
return true;
if (s.isUndefWeak())
return false;
return !(expr == R_PC && PPC::inBranchRange(type, branchAddr, s.getVA()));
}
uint32_t PPC::getThunkSectionSpacing() const { return 0x2000000; }
bool PPC::inBranchRange(RelType type, uint64_t src, uint64_t dst) const {
uint64_t offset = dst - src;
if (type == R_PPC_REL24 || type == R_PPC_PLTREL24)
return isInt<26>(offset);
llvm_unreachable("unsupported relocation type used in branch");
}
RelExpr PPC::getRelExpr(RelType type, const Symbol &s,
const uint8_t *loc) const {
switch (type) {
+ case R_PPC_NONE:
+ return R_NONE;
+ case R_PPC_ADDR16_HA:
+ case R_PPC_ADDR16_HI:
+ case R_PPC_ADDR16_LO:
+ case R_PPC_ADDR32:
+ return R_ABS;
case R_PPC_DTPREL16:
case R_PPC_DTPREL16_HA:
case R_PPC_DTPREL16_HI:
case R_PPC_DTPREL16_LO:
case R_PPC_DTPREL32:
return R_DTPREL;
case R_PPC_REL14:
case R_PPC_REL32:
case R_PPC_LOCAL24PC:
case R_PPC_REL16_LO:
case R_PPC_REL16_HI:
case R_PPC_REL16_HA:
return R_PC;
case R_PPC_GOT16:
return R_GOT_OFF;
case R_PPC_REL24:
return R_PLT_PC;
case R_PPC_PLTREL24:
return R_PPC32_PLTREL;
case R_PPC_GOT_TLSGD16:
return R_TLSGD_GOT;
case R_PPC_GOT_TLSLD16:
return R_TLSLD_GOT;
case R_PPC_GOT_TPREL16:
return R_GOT_OFF;
case R_PPC_TLS:
return R_TLSIE_HINT;
case R_PPC_TLSGD:
return R_TLSDESC_CALL;
case R_PPC_TLSLD:
return R_TLSLD_HINT;
case R_PPC_TPREL16:
case R_PPC_TPREL16_HA:
case R_PPC_TPREL16_LO:
case R_PPC_TPREL16_HI:
return R_TLS;
default:
- return R_ABS;
+ error(getErrorLocation(loc) + "unknown relocation (" + Twine(type) +
+ ") against symbol " + toString(s));
+ return R_NONE;
}
}
RelType PPC::getDynRel(RelType type) const {
if (type == R_PPC_ADDR32)
return type;
return R_PPC_NONE;
}
static std::pair<RelType, uint64_t> fromDTPREL(RelType type, uint64_t val) {
uint64_t dtpBiasedVal = val - 0x8000;
switch (type) {
case R_PPC_DTPREL16:
return {R_PPC64_ADDR16, dtpBiasedVal};
case R_PPC_DTPREL16_HA:
return {R_PPC_ADDR16_HA, dtpBiasedVal};
case R_PPC_DTPREL16_HI:
return {R_PPC_ADDR16_HI, dtpBiasedVal};
case R_PPC_DTPREL16_LO:
return {R_PPC_ADDR16_LO, dtpBiasedVal};
case R_PPC_DTPREL32:
return {R_PPC_ADDR32, dtpBiasedVal};
default:
return {type, val};
}
}
void PPC::relocateOne(uint8_t *loc, RelType type, uint64_t val) const {
RelType newType;
std::tie(newType, val) = fromDTPREL(type, val);
switch (newType) {
case R_PPC_ADDR16:
checkIntUInt(loc, val, 16, type);
write16(loc, val);
break;
case R_PPC_GOT16:
case R_PPC_GOT_TLSGD16:
case R_PPC_GOT_TLSLD16:
case R_PPC_GOT_TPREL16:
case R_PPC_TPREL16:
checkInt(loc, val, 16, type);
write16(loc, val);
break;
case R_PPC_ADDR16_HA:
case R_PPC_DTPREL16_HA:
case R_PPC_GOT_TLSGD16_HA:
case R_PPC_GOT_TLSLD16_HA:
case R_PPC_GOT_TPREL16_HA:
case R_PPC_REL16_HA:
case R_PPC_TPREL16_HA:
write16(loc, ha(val));
break;
case R_PPC_ADDR16_HI:
case R_PPC_DTPREL16_HI:
case R_PPC_GOT_TLSGD16_HI:
case R_PPC_GOT_TLSLD16_HI:
case R_PPC_GOT_TPREL16_HI:
case R_PPC_REL16_HI:
case R_PPC_TPREL16_HI:
write16(loc, val >> 16);
break;
case R_PPC_ADDR16_LO:
case R_PPC_DTPREL16_LO:
case R_PPC_GOT_TLSGD16_LO:
case R_PPC_GOT_TLSLD16_LO:
case R_PPC_GOT_TPREL16_LO:
case R_PPC_REL16_LO:
case R_PPC_TPREL16_LO:
write16(loc, val);
break;
case R_PPC_ADDR32:
case R_PPC_REL32:
write32(loc, val);
break;
case R_PPC_REL14: {
uint32_t mask = 0x0000FFFC;
checkInt(loc, val, 16, type);
checkAlignment(loc, val, 4, type);
write32(loc, (read32(loc) & ~mask) | (val & mask));
break;
}
case R_PPC_REL24:
case R_PPC_LOCAL24PC:
case R_PPC_PLTREL24: {
uint32_t mask = 0x03FFFFFC;
checkInt(loc, val, 26, type);
checkAlignment(loc, val, 4, type);
write32(loc, (read32(loc) & ~mask) | (val & mask));
break;
}
default:
- error(getErrorLocation(loc) + "unrecognized relocation " + toString(type));
+ llvm_unreachable("unknown relocation");
}
}
RelExpr PPC::adjustRelaxExpr(RelType type, const uint8_t *data,
RelExpr expr) const {
if (expr == R_RELAX_TLS_GD_TO_IE)
return R_RELAX_TLS_GD_TO_IE_GOT_OFF;
if (expr == R_RELAX_TLS_LD_TO_LE)
return R_RELAX_TLS_LD_TO_LE_ABS;
return expr;
}
int PPC::getTlsGdRelaxSkip(RelType type) const {
// A __tls_get_addr call instruction is marked with 2 relocations:
//
// R_PPC_TLSGD / R_PPC_TLSLD: marker relocation
// R_PPC_REL24: __tls_get_addr
//
// After the relaxation we no longer call __tls_get_addr and should skip both
// relocations to not create a false dependence on __tls_get_addr being
// defined.
if (type == R_PPC_TLSGD || type == R_PPC_TLSLD)
return 2;
return 1;
}
void PPC::relaxTlsGdToIe(uint8_t *loc, RelType type, uint64_t val) const {
switch (type) {
case R_PPC_GOT_TLSGD16: {
// addi rT, rA, x@got@tlsgd --> lwz rT, x@got@tprel(rA)
uint32_t insn = readFromHalf16(loc);
writeFromHalf16(loc, 0x80000000 | (insn & 0x03ff0000));
relocateOne(loc, R_PPC_GOT_TPREL16, val);
break;
}
case R_PPC_TLSGD:
// bl __tls_get_addr(x@tlsgd) --> add r3, r3, r2
write32(loc, 0x7c631214);
break;
default:
llvm_unreachable("unsupported relocation for TLS GD to IE relaxation");
}
}
void PPC::relaxTlsGdToLe(uint8_t *loc, RelType type, uint64_t val) const {
switch (type) {
case R_PPC_GOT_TLSGD16:
// addi r3, r31, x@got@tlsgd --> addis r3, r2, x@tprel@ha
writeFromHalf16(loc, 0x3c620000 | ha(val));
break;
case R_PPC_TLSGD:
// bl __tls_get_addr(x@tlsgd) --> addi r3, r3, x@tprel@l
write32(loc, 0x38630000 | lo(val));
break;
default:
llvm_unreachable("unsupported relocation for TLS GD to LE relaxation");
}
}
void PPC::relaxTlsLdToLe(uint8_t *loc, RelType type, uint64_t val) const {
switch (type) {
case R_PPC_GOT_TLSLD16:
// addi r3, rA, x@got@tlsgd --> addis r3, r2, 0
writeFromHalf16(loc, 0x3c620000);
break;
case R_PPC_TLSLD:
// r3+x@dtprel computes r3+x-0x8000, while we want it to compute r3+x@tprel
// = r3+x-0x7000, so add 4096 to r3.
// bl __tls_get_addr(x@tlsld) --> addi r3, r3, 4096
write32(loc, 0x38631000);
break;
case R_PPC_DTPREL16:
case R_PPC_DTPREL16_HA:
case R_PPC_DTPREL16_HI:
case R_PPC_DTPREL16_LO:
relocateOne(loc, type, val);
break;
default:
llvm_unreachable("unsupported relocation for TLS LD to LE relaxation");
}
}
void PPC::relaxTlsIeToLe(uint8_t *loc, RelType type, uint64_t val) const {
switch (type) {
case R_PPC_GOT_TPREL16: {
// lwz rT, x@got@tprel(rA) --> addis rT, r2, x@tprel@ha
uint32_t rt = readFromHalf16(loc) & 0x03e00000;
writeFromHalf16(loc, 0x3c020000 | rt | ha(val));
break;
}
case R_PPC_TLS: {
uint32_t insn = read32(loc);
if (insn >> 26 != 31)
error("unrecognized instruction for IE to LE R_PPC_TLS");
// addi rT, rT, x@tls --> addi rT, rT, x@tprel@l
uint32_t dFormOp = getPPCDFormOp((read32(loc) & 0x000007fe) >> 1);
if (dFormOp == 0)
error("unrecognized instruction for IE to LE R_PPC_TLS");
write32(loc, (dFormOp << 26) | (insn & 0x03ff0000) | lo(val));
break;
}
default:
llvm_unreachable("unsupported relocation for TLS IE to LE relaxation");
}
}
TargetInfo *elf::getPPCTargetInfo() {
static PPC target;
return &target;
}
Index: projects/clang900-import/contrib/llvm/tools/lld/ELF/Arch/PPC64.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/ELF/Arch/PPC64.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/ELF/Arch/PPC64.cpp (revision 351722)
@@ -1,1077 +1,1095 @@
//===- PPC64.cpp ----------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Symbols.h"
#include "SyntheticSections.h"
#include "Target.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/Support/Endian.h"
using namespace llvm;
using namespace llvm::object;
using namespace llvm::support::endian;
using namespace llvm::ELF;
using namespace lld;
using namespace lld::elf;
static uint64_t ppc64TocOffset = 0x8000;
static uint64_t dynamicThreadPointerOffset = 0x8000;
// The instruction encoding of bits 21-30 from the ISA for the Xform and Dform
// instructions that can be used as part of the initial exec TLS sequence.
enum XFormOpcd {
LBZX = 87,
LHZX = 279,
LWZX = 23,
LDX = 21,
STBX = 215,
STHX = 407,
STWX = 151,
STDX = 149,
ADD = 266,
};
enum DFormOpcd {
LBZ = 34,
LBZU = 35,
LHZ = 40,
LHZU = 41,
LHAU = 43,
LWZ = 32,
LWZU = 33,
LFSU = 49,
LD = 58,
LFDU = 51,
STB = 38,
STBU = 39,
STH = 44,
STHU = 45,
STW = 36,
STWU = 37,
STFSU = 53,
STFDU = 55,
STD = 62,
ADDI = 14
};
uint64_t elf::getPPC64TocBase() {
// The TOC consists of sections .got, .toc, .tocbss, .plt in that order. The
// TOC starts where the first of these sections starts. We always create a
// .got when we see a relocation that uses it, so for us the start is always
// the .got.
uint64_t tocVA = in.got->getVA();
// Per the ppc64-elf-linux ABI, the TOC base is the TOC value plus 0x8000
// thus permitting a full 64 Kbytes segment. Note that the glibc startup
// code (crt1.o) assumes that you can get from the TOC base to the
// start of the .toc section with only a single (signed) 16-bit relocation.
return tocVA + ppc64TocOffset;
}
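// Illustrative sketch (editor's addition, not part of the original diff): with
// the 0x8000 bias, a signed 16-bit displacement off the TOC pointer reaches
// the whole 64 KiB TOC. The .got address below is an invented example value.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t tocOffset = 0x8000;
  uint64_t gotVA = 0x10020000;         // assumed start of .got/.toc
  uint64_t tocBase = gotVA + tocOffset; // the value kept in r2
  // Range reachable with a signed 16-bit offset from r2:
  printf("covers 0x%llx .. 0x%llx\n",
         (unsigned long long)(tocBase - 0x8000),
         (unsigned long long)(tocBase + 0x7fff));
}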
unsigned elf::getPPC64GlobalEntryToLocalEntryOffset(uint8_t stOther) {
// The offset is encoded into the 3 most significant bits of the st_other
// field, with some special values described in section 3.4.1 of the ABI:
// 0 --> Zero offset between the GEP and LEP, and the function does NOT use
// the TOC pointer (r2). r2 will hold the same value on returning from
// the function as it did on entering the function.
// 1 --> Zero offset between the GEP and LEP, and r2 should be treated as a
// caller-saved register for all callers.
// 2-6 --> The binary logarithm of the offset eg:
// 2 --> 2^2 = 4 bytes --> 1 instruction.
// 6 --> 2^6 = 64 bytes --> 16 instructions.
// 7 --> Reserved.
uint8_t gepToLep = (stOther >> 5) & 7;
if (gepToLep < 2)
return 0;
// The value encoded in the st_other bits is the
// log-base-2(offset).
if (gepToLep < 7)
return 1 << gepToLep;
error("reserved value of 7 in the 3 most-significant-bits of st_other");
return 0;
}
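// Illustrative sketch (editor's addition, not part of the original diff):
// decoding the GEP-to-LEP distance from the three most-significant st_other
// bits, as described above. The st_other values are examples only.
#include <cstdint>
#include <cstdio>

static unsigned gepToLepOffset(uint8_t stOther) {
  uint8_t gepToLep = (stOther >> 5) & 7;
  if (gepToLep < 2)
    return 0;            // no separate local entry point
  return 1u << gepToLep; // 2..6 encode log2 of the byte offset
}

int main() {
  printf("%u\n", gepToLepOffset(0x60)); // encoded 3: 2^3 = 8 bytes (2 insns)
  printf("%u\n", gepToLepOffset(0x40)); // encoded 2: 2^2 = 4 bytes (1 insn)
  printf("%u\n", gepToLepOffset(0x00)); // encoded 0: same entry point
}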
bool elf::isPPC64SmallCodeModelTocReloc(RelType type) {
// The only small code model relocations that access the .toc section.
return type == R_PPC64_TOC16 || type == R_PPC64_TOC16_DS;
}
// Find the R_PPC64_ADDR64 in .rela.toc with matching offset.
template <typename ELFT>
static std::pair<Defined *, int64_t>
getRelaTocSymAndAddend(InputSectionBase *tocSec, uint64_t offset) {
if (tocSec->numRelocations == 0)
return {};
// .rela.toc contains exclusively R_PPC64_ADDR64 relocations sorted by
// r_offset: 0, 8, 16, etc. For a given Offset, Offset / 8 gives us the
// relocation index in most cases.
//
// In rare cases a TOC entry may store a constant that doesn't need an
// R_PPC64_ADDR64, the corresponding r_offset is therefore missing. Offset / 8
// points to a relocation with larger r_offset. Do a linear probe then.
// Constants are extremely uncommon in .toc and the extra number of array
// accesses can be seen as a small constant.
ArrayRef<typename ELFT::Rela> relas = tocSec->template relas<ELFT>();
uint64_t index = std::min<uint64_t>(offset / 8, relas.size() - 1);
for (;;) {
if (relas[index].r_offset == offset) {
Symbol &sym = tocSec->getFile<ELFT>()->getRelocTargetSym(relas[index]);
return {dyn_cast<Defined>(&sym), getAddend<ELFT>(relas[index])};
}
if (relas[index].r_offset < offset || index == 0)
break;
--index;
}
return {};
}
// When accessing a symbol defined in another translation unit, compilers
// reserve a .toc entry, allocate a local label and generate toc-indirect
// instructions:
//
// addis 3, 2, .LC0@toc@ha # R_PPC64_TOC16_HA
// ld 3, .LC0@toc@l(3) # R_PPC64_TOC16_LO_DS, load the address from a .toc entry
// ld/lwa 3, 0(3) # load the value from the address
//
// .section .toc,"aw",@progbits
// .LC0: .tc var[TC],var
//
// If var is defined, non-preemptable and addressable with a 32-bit signed
// offset from the toc base, the address of var can be computed by adding an
// offset to the toc base, saving a load.
//
// addis 3,2,var@toc@ha # this may be relaxed to a nop,
// addi 3,3,var@toc@l # then this becomes addi 3,2,var@toc
// ld/lwa 3, 0(3) # load the value from the address
//
// Returns true if the relaxation is performed.
bool elf::tryRelaxPPC64TocIndirection(RelType type, const Relocation &rel,
uint8_t *bufLoc) {
assert(config->tocOptimize);
if (rel.addend < 0)
return false;
// If the symbol is not the .toc section, this isn't a toc-indirection.
Defined *defSym = dyn_cast<Defined>(rel.sym);
if (!defSym || !defSym->isSection() || defSym->section->name != ".toc")
return false;
Defined *d;
int64_t addend;
auto *tocISB = cast<InputSectionBase>(defSym->section);
std::tie(d, addend) =
config->isLE ? getRelaTocSymAndAddend<ELF64LE>(tocISB, rel.addend)
: getRelaTocSymAndAddend<ELF64BE>(tocISB, rel.addend);
// Only non-preemptable defined symbols can be relaxed.
if (!d || d->isPreemptible)
return false;
// Two instructions can materialize a 32-bit signed offset from the toc base.
uint64_t tocRelative = d->getVA(addend) - getPPC64TocBase();
if (!isInt<32>(tocRelative))
return false;
// Add PPC64TocOffset that will be subtracted by relocateOne().
target->relaxGot(bufLoc, type, tocRelative + ppc64TocOffset);
return true;
}
namespace {
class PPC64 final : public TargetInfo {
public:
PPC64();
int getTlsGdRelaxSkip(RelType type) const override;
uint32_t calcEFlags() const override;
RelExpr getRelExpr(RelType type, const Symbol &s,
const uint8_t *loc) const override;
RelType getDynRel(RelType type) const override;
void writePltHeader(uint8_t *buf) const override;
void writePlt(uint8_t *buf, uint64_t gotPltEntryAddr, uint64_t pltEntryAddr,
int32_t index, unsigned relOff) const override;
void relocateOne(uint8_t *loc, RelType type, uint64_t val) const override;
void writeGotHeader(uint8_t *buf) const override;
bool needsThunk(RelExpr expr, RelType type, const InputFile *file,
uint64_t branchAddr, const Symbol &s) const override;
uint32_t getThunkSectionSpacing() const override;
bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override;
RelExpr adjustRelaxExpr(RelType type, const uint8_t *data,
RelExpr expr) const override;
void relaxGot(uint8_t *loc, RelType type, uint64_t val) const override;
void relaxTlsGdToIe(uint8_t *loc, RelType type, uint64_t val) const override;
void relaxTlsGdToLe(uint8_t *loc, RelType type, uint64_t val) const override;
void relaxTlsLdToLe(uint8_t *loc, RelType type, uint64_t val) const override;
void relaxTlsIeToLe(uint8_t *loc, RelType type, uint64_t val) const override;
bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
uint8_t stOther) const override;
};
} // namespace
// Relocation masks following the #lo(value), #hi(value), #ha(value),
// #higher(value), #highera(value), #highest(value), and #highesta(value)
// macros defined in section 4.5.1. Relocation Types of the PPC-elf64abi
// document.
static uint16_t lo(uint64_t v) { return v; }
static uint16_t hi(uint64_t v) { return v >> 16; }
static uint16_t ha(uint64_t v) { return (v + 0x8000) >> 16; }
static uint16_t higher(uint64_t v) { return v >> 32; }
static uint16_t highera(uint64_t v) { return (v + 0x8000) >> 32; }
static uint16_t highest(uint64_t v) { return v >> 48; }
static uint16_t highesta(uint64_t v) { return (v + 0x8000) >> 48; }
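// Illustrative sketch (editor's addition, not part of the original diff):
// splitting a 64-bit value into the 16-bit pieces used by the @l/@hi/@higher/
// @highest relocation operators defined above. The value is arbitrary.
#include <cassert>
#include <cstdint>

static uint16_t exLo(uint64_t v) { return v; }
static uint16_t exHi(uint64_t v) { return v >> 16; }
static uint16_t exHigher(uint64_t v) { return v >> 32; }
static uint16_t exHighest(uint64_t v) { return v >> 48; }

int main() {
  uint64_t v = 0x123456789abcdef0ULL;
  uint64_t rebuilt = (uint64_t(exHighest(v)) << 48) |
                     (uint64_t(exHigher(v)) << 32) |
                     (uint64_t(exHi(v)) << 16) | exLo(v);
  assert(rebuilt == v); // the four 16-bit pieces reassemble the original value
}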
// Extracts the 'PO' field of an instruction encoding.
static uint8_t getPrimaryOpCode(uint32_t encoding) { return (encoding >> 26); }
static bool isDQFormInstruction(uint32_t encoding) {
switch (getPrimaryOpCode(encoding)) {
default:
return false;
case 56:
// The only instruction with a primary opcode of 56 is `lq`.
return true;
case 61:
// There are both DS and DQ instruction forms with this primary opcode.
// Namely `lxv` and `stxv` are the DQ-forms that use it.
// The DS 'XO' bits being set to 01 is restricted to DQ form.
return (encoding & 3) == 0x1;
}
}
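// Illustrative sketch (editor's addition, not part of the original diff): the
// primary opcode lives in the top 6 bits of a PPC instruction word, which is
// what getPrimaryOpCode() extracts above. The encodings below are examples
// (an lq and a D-form addi).
#include <cstdint>
#include <cstdio>

static uint8_t primaryOpCode(uint32_t encoding) { return encoding >> 26; }

int main() {
  printf("%u\n", primaryOpCode(0xe0030000)); // top 6 bits == 56 (lq)
  printf("%u\n", primaryOpCode(0x38630000)); // top 6 bits == 14 (addi r3,r3,0)
}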
static bool isInstructionUpdateForm(uint32_t encoding) {
switch (getPrimaryOpCode(encoding)) {
default:
return false;
case LBZU:
case LHAU:
case LHZU:
case LWZU:
case LFSU:
case LFDU:
case STBU:
case STHU:
case STWU:
case STFSU:
case STFDU:
return true;
// LWA has the same opcode as LD, and the DS bits are what differentiate
// between LD/LDU/LWA.
case LD:
case STD:
return (encoding & 3) == 1;
}
}
// There are a number of places when we either want to read or write an
// instruction when handling a half16 relocation type. On big-endian the buffer
// pointer is pointing into the middle of the word we want to extract, and on
// little-endian it is pointing to the start of the word. These 2 helpers are to
// simplify reading and writing in that context.
static void writeFromHalf16(uint8_t *loc, uint32_t insn) {
write32(config->isLE ? loc : loc - 2, insn);
}
static uint32_t readFromHalf16(const uint8_t *loc) {
return read32(config->isLE ? loc : loc - 2);
}
PPC64::PPC64() {
gotRel = R_PPC64_GLOB_DAT;
noneRel = R_PPC64_NONE;
pltRel = R_PPC64_JMP_SLOT;
relativeRel = R_PPC64_RELATIVE;
iRelativeRel = R_PPC64_IRELATIVE;
symbolicRel = R_PPC64_ADDR64;
pltEntrySize = 4;
gotBaseSymInGotPlt = false;
gotHeaderEntriesNum = 1;
gotPltHeaderEntriesNum = 2;
pltHeaderSize = 60;
needsThunks = true;
tlsModuleIndexRel = R_PPC64_DTPMOD64;
tlsOffsetRel = R_PPC64_DTPREL64;
tlsGotRel = R_PPC64_TPREL64;
needsMoreStackNonSplit = false;
// We need 64K pages (at least under glibc/Linux, the loader won't
// set different permissions on a finer granularity than that).
defaultMaxPageSize = 65536;
// The PPC64 ELF ABI v1 spec, says:
//
// It is normally desirable to put segments with different characteristics
// in separate 256 Mbyte portions of the address space, to give the
// operating system full paging flexibility in the 64-bit address space.
//
// And because the lowest non-zero 256M boundary is 0x10000000, PPC64 linkers
// use 0x10000000 as the starting address.
defaultImageBase = 0x10000000;
write32(trapInstr.data(), 0x7fe00008);
}
int PPC64::getTlsGdRelaxSkip(RelType type) const {
// A __tls_get_addr call instruction is marked with 2 relocations:
//
// R_PPC64_TLSGD / R_PPC64_TLSLD: marker relocation
// R_PPC64_REL24: __tls_get_addr
//
// After the relaxation we no longer call __tls_get_addr and should skip both
// relocations to not create a false dependence on __tls_get_addr being
// defined.
if (type == R_PPC64_TLSGD || type == R_PPC64_TLSLD)
return 2;
return 1;
}
static uint32_t getEFlags(InputFile *file) {
if (config->ekind == ELF64BEKind)
return cast<ObjFile<ELF64BE>>(file)->getObj().getHeader()->e_flags;
return cast<ObjFile<ELF64LE>>(file)->getObj().getHeader()->e_flags;
}
// This file implements the v2 ABI. This function makes sure that all
// object files have either v2 or an unspecified version as their ABI version.
uint32_t PPC64::calcEFlags() const {
for (InputFile *f : objectFiles) {
uint32_t flag = getEFlags(f);
if (flag == 1)
error(toString(f) + ": ABI version 1 is not supported");
else if (flag > 2)
error(toString(f) + ": unrecognized e_flags: " + Twine(flag));
}
return 2;
}
void PPC64::relaxGot(uint8_t *loc, RelType type, uint64_t val) const {
switch (type) {
case R_PPC64_TOC16_HA:
// Convert "addis reg, 2, .LC0@toc@h" to "addis reg, 2, var@toc@h" or "nop".
relocateOne(loc, type, val);
break;
case R_PPC64_TOC16_LO_DS: {
// Convert "ld reg, .LC0@toc@l(reg)" to "addi reg, reg, var@toc@l" or
// "addi reg, 2, var@toc".
uint32_t insn = readFromHalf16(loc);
if (getPrimaryOpCode(insn) != LD)
error("expected a 'ld' for got-indirect to toc-relative relaxing");
writeFromHalf16(loc, (insn & 0x03ffffff) | 0x38000000);
relocateOne(loc, R_PPC64_TOC16_LO, val);
break;
}
default:
llvm_unreachable("unexpected relocation type");
}
}
void PPC64::relaxTlsGdToLe(uint8_t *loc, RelType type, uint64_t val) const {
// Reference: 3.7.4.2 of the 64-bit ELF V2 abi supplement.
// The general dynamic code sequence for a global `x` will look like:
// Instruction Relocation Symbol
// addis r3, r2, x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
// addi r3, r3, x@got@tlsgd@l R_PPC64_GOT_TLSGD16_LO x
// bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
// R_PPC64_REL24 __tls_get_addr
// nop None None
// Relaxing to local exec entails converting:
// addis r3, r2, x@got@tlsgd@ha into nop
// addi r3, r3, x@got@tlsgd@l into addis r3, r13, x@tprel@ha
// bl __tls_get_addr(x@tlsgd) into nop
// nop into addi r3, r3, x@tprel@l
switch (type) {
case R_PPC64_GOT_TLSGD16_HA:
writeFromHalf16(loc, 0x60000000); // nop
break;
case R_PPC64_GOT_TLSGD16:
case R_PPC64_GOT_TLSGD16_LO:
writeFromHalf16(loc, 0x3c6d0000); // addis r3, r13
relocateOne(loc, R_PPC64_TPREL16_HA, val);
break;
case R_PPC64_TLSGD:
write32(loc, 0x60000000); // nop
write32(loc + 4, 0x38630000); // addi r3, r3
// Since we are relocating a half16 type relocation and loc + 4 points to
// the start of an instruction, we need to advance the buffer by an extra
// 2 bytes on BE.
relocateOne(loc + 4 + (config->ekind == ELF64BEKind ? 2 : 0),
R_PPC64_TPREL16_LO, val);
break;
default:
llvm_unreachable("unsupported relocation for TLS GD to LE relaxation");
}
}
void PPC64::relaxTlsLdToLe(uint8_t *loc, RelType type, uint64_t val) const {
// Reference: 3.7.4.3 of the 64-bit ELF V2 abi supplement.
// The local dynamic code sequence for a global `x` will look like:
// Instruction Relocation Symbol
// addis r3, r2, x@got@tlsld@ha R_PPC64_GOT_TLSLD16_HA x
// addi r3, r3, x@got@tlsld@l R_PPC64_GOT_TLSLD16_LO x
// bl __tls_get_addr(x@tlsgd) R_PPC64_TLSLD x
// R_PPC64_REL24 __tls_get_addr
// nop None None
// Relaxing to local exec entails converting:
// addis r3, r2, x@got@tlsld@ha into nop
// addi r3, r3, x@got@tlsld@l into addis r3, r13, 0
// bl __tls_get_addr(x@tlsgd) into nop
// nop into addi r3, r3, 4096
switch (type) {
case R_PPC64_GOT_TLSLD16_HA:
writeFromHalf16(loc, 0x60000000); // nop
break;
case R_PPC64_GOT_TLSLD16_LO:
writeFromHalf16(loc, 0x3c6d0000); // addis r3, r13, 0
break;
case R_PPC64_TLSLD:
write32(loc, 0x60000000); // nop
write32(loc + 4, 0x38631000); // addi r3, r3, 4096
break;
case R_PPC64_DTPREL16:
case R_PPC64_DTPREL16_HA:
case R_PPC64_DTPREL16_HI:
case R_PPC64_DTPREL16_DS:
case R_PPC64_DTPREL16_LO:
case R_PPC64_DTPREL16_LO_DS:
relocateOne(loc, type, val);
break;
default:
llvm_unreachable("unsupported relocation for TLS LD to LE relaxation");
}
}
unsigned elf::getPPCDFormOp(unsigned secondaryOp) {
switch (secondaryOp) {
case LBZX:
return LBZ;
case LHZX:
return LHZ;
case LWZX:
return LWZ;
case LDX:
return LD;
case STBX:
return STB;
case STHX:
return STH;
case STWX:
return STW;
case STDX:
return STD;
case ADD:
return ADDI;
default:
return 0;
}
}
void PPC64::relaxTlsIeToLe(uint8_t *loc, RelType type, uint64_t val) const {
// The initial exec code sequence for a global `x` will look like:
// Instruction Relocation Symbol
// addis r9, r2, x@got@tprel@ha R_PPC64_GOT_TPREL16_HA x
// ld r9, x@got@tprel@l(r9) R_PPC64_GOT_TPREL16_LO_DS x
// add r9, r9, x@tls R_PPC64_TLS x
// Relaxing to local exec entails converting:
// addis r9, r2, x@got@tprel@ha into nop
// ld r9, x@got@tprel@l(r9) into addis r9, r13, x@tprel@ha
// add r9, r9, x@tls into addi r9, r9, x@tprel@l
// x@tls R_PPC64_TLS is a marker relocation that does not compute anything;
// the operand it marks is replaced with r13 (the thread pointer).
// The add instruction in the initial-exec sequence has multiple variations
// that need to be handled. If we are building an address it will be an add
// instruction; if we are accessing memory it will be one of the X-form
// indexed load or store instructions.
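// For instance (registers here are hypothetical, chosen only to illustrate the
// transformation): an X-form access such as
//   lwzx r10, r9, x@tls
// keeps its RT/RA fields but has its primary opcode rewritten via
// getPPCDFormOp, becoming the D-form
//   lwz r10, x@tprel@l(r9)
// with R_PPC64_TPREL16_LO then applied to the new displacement field.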
unsigned offset = (config->ekind == ELF64BEKind) ? 2 : 0;
switch (type) {
case R_PPC64_GOT_TPREL16_HA:
write32(loc - offset, 0x60000000); // nop
break;
case R_PPC64_GOT_TPREL16_LO_DS:
case R_PPC64_GOT_TPREL16_DS: {
uint32_t regNo = read32(loc - offset) & 0x03E00000; // bits 6-10
write32(loc - offset, 0x3C0D0000 | regNo); // addis RegNo, r13
relocateOne(loc, R_PPC64_TPREL16_HA, val);
break;
}
case R_PPC64_TLS: {
uint32_t primaryOp = getPrimaryOpCode(read32(loc));
if (primaryOp != 31)
error("unrecognized instruction for IE to LE R_PPC64_TLS");
uint32_t secondaryOp = (read32(loc) & 0x000007FE) >> 1; // bits 21-30
uint32_t dFormOp = getPPCDFormOp(secondaryOp);
if (dFormOp == 0)
error("unrecognized instruction for IE to LE R_PPC64_TLS");
write32(loc, ((dFormOp << 26) | (read32(loc) & 0x03FFFFFF)));
relocateOne(loc + offset, R_PPC64_TPREL16_LO, val);
break;
}
default:
llvm_unreachable("unknown relocation for IE to LE");
break;
}
}
RelExpr PPC64::getRelExpr(RelType type, const Symbol &s,
const uint8_t *loc) const {
switch (type) {
+ case R_PPC64_NONE:
+ return R_NONE;
+ case R_PPC64_ADDR16:
+ case R_PPC64_ADDR16_DS:
+ case R_PPC64_ADDR16_HA:
+ case R_PPC64_ADDR16_HI:
+ case R_PPC64_ADDR16_HIGHER:
+ case R_PPC64_ADDR16_HIGHERA:
+ case R_PPC64_ADDR16_HIGHEST:
+ case R_PPC64_ADDR16_HIGHESTA:
+ case R_PPC64_ADDR16_LO:
+ case R_PPC64_ADDR16_LO_DS:
+ case R_PPC64_ADDR32:
+ case R_PPC64_ADDR64:
+ return R_ABS;
case R_PPC64_GOT16:
case R_PPC64_GOT16_DS:
case R_PPC64_GOT16_HA:
case R_PPC64_GOT16_HI:
case R_PPC64_GOT16_LO:
case R_PPC64_GOT16_LO_DS:
return R_GOT_OFF;
case R_PPC64_TOC16:
case R_PPC64_TOC16_DS:
case R_PPC64_TOC16_HI:
case R_PPC64_TOC16_LO:
return R_GOTREL;
case R_PPC64_TOC16_HA:
case R_PPC64_TOC16_LO_DS:
return config->tocOptimize ? R_PPC64_RELAX_TOC : R_GOTREL;
case R_PPC64_TOC:
return R_PPC64_TOCBASE;
case R_PPC64_REL14:
case R_PPC64_REL24:
return R_PPC64_CALL_PLT;
case R_PPC64_REL16_LO:
case R_PPC64_REL16_HA:
+ case R_PPC64_REL16_HI:
case R_PPC64_REL32:
case R_PPC64_REL64:
return R_PC;
case R_PPC64_GOT_TLSGD16:
case R_PPC64_GOT_TLSGD16_HA:
case R_PPC64_GOT_TLSGD16_HI:
case R_PPC64_GOT_TLSGD16_LO:
return R_TLSGD_GOT;
case R_PPC64_GOT_TLSLD16:
case R_PPC64_GOT_TLSLD16_HA:
case R_PPC64_GOT_TLSLD16_HI:
case R_PPC64_GOT_TLSLD16_LO:
return R_TLSLD_GOT;
case R_PPC64_GOT_TPREL16_HA:
case R_PPC64_GOT_TPREL16_LO_DS:
case R_PPC64_GOT_TPREL16_DS:
case R_PPC64_GOT_TPREL16_HI:
return R_GOT_OFF;
case R_PPC64_GOT_DTPREL16_HA:
case R_PPC64_GOT_DTPREL16_LO_DS:
case R_PPC64_GOT_DTPREL16_DS:
case R_PPC64_GOT_DTPREL16_HI:
return R_TLSLD_GOT_OFF;
case R_PPC64_TPREL16:
case R_PPC64_TPREL16_HA:
case R_PPC64_TPREL16_LO:
case R_PPC64_TPREL16_HI:
case R_PPC64_TPREL16_DS:
case R_PPC64_TPREL16_LO_DS:
case R_PPC64_TPREL16_HIGHER:
case R_PPC64_TPREL16_HIGHERA:
case R_PPC64_TPREL16_HIGHEST:
case R_PPC64_TPREL16_HIGHESTA:
return R_TLS;
case R_PPC64_DTPREL16:
case R_PPC64_DTPREL16_DS:
case R_PPC64_DTPREL16_HA:
case R_PPC64_DTPREL16_HI:
case R_PPC64_DTPREL16_HIGHER:
case R_PPC64_DTPREL16_HIGHERA:
case R_PPC64_DTPREL16_HIGHEST:
case R_PPC64_DTPREL16_HIGHESTA:
case R_PPC64_DTPREL16_LO:
case R_PPC64_DTPREL16_LO_DS:
case R_PPC64_DTPREL64:
return R_DTPREL;
case R_PPC64_TLSGD:
return R_TLSDESC_CALL;
case R_PPC64_TLSLD:
return R_TLSLD_HINT;
case R_PPC64_TLS:
return R_TLSIE_HINT;
default:
- return R_ABS;
+ error(getErrorLocation(loc) + "unknown relocation (" + Twine(type) +
+ ") against symbol " + toString(s));
+ return R_NONE;
}
}
RelType PPC64::getDynRel(RelType type) const {
if (type == R_PPC64_ADDR64 || type == R_PPC64_TOC)
return R_PPC64_ADDR64;
return R_PPC64_NONE;
}
void PPC64::writeGotHeader(uint8_t *buf) const {
write64(buf, getPPC64TocBase());
}
void PPC64::writePltHeader(uint8_t *buf) const {
// The generic resolver stub goes first.
write32(buf + 0, 0x7c0802a6); // mflr r0
write32(buf + 4, 0x429f0005); // bcl 20,4*cr7+so,8 <_glink+0x8>
write32(buf + 8, 0x7d6802a6); // mflr r11
write32(buf + 12, 0x7c0803a6); // mtlr r0
write32(buf + 16, 0x7d8b6050); // subf r12, r11, r12
write32(buf + 20, 0x380cffcc); // subi r0,r12,52
write32(buf + 24, 0x7800f082); // srdi r0,r0,62,2
write32(buf + 28, 0xe98b002c); // ld r12,44(r11)
write32(buf + 32, 0x7d6c5a14); // add r11,r12,r11
write32(buf + 36, 0xe98b0000); // ld r12,0(r11)
write32(buf + 40, 0xe96b0008); // ld r11,8(r11)
write32(buf + 44, 0x7d8903a6); // mtctr r12
write32(buf + 48, 0x4e800420); // bctr
// The 'bcl' instruction will set the link register to the address of the
// following instruction ('mflr r11'). Here we store the offset from that
// instruction to the first entry in the GotPlt section.
int64_t gotPltOffset = in.gotPlt->getVA() - (in.plt->getVA() + 8);
write64(buf + 52, gotPltOffset);
}
void PPC64::writePlt(uint8_t *buf, uint64_t gotPltEntryAddr,
uint64_t pltEntryAddr, int32_t index,
unsigned relOff) const {
int32_t offset = pltHeaderSize + index * pltEntrySize;
// bl __glink_PLTresolve
write32(buf, 0x48000000 | ((-offset) & 0x03FFFFFC));
}
static std::pair<RelType, uint64_t> toAddr16Rel(RelType type, uint64_t val) {
// Relocations relative to the toc-base need to be adjusted by the Toc offset.
uint64_t tocBiasedVal = val - ppc64TocOffset;
// Relocations relative to dtv[dtpmod] need to be adjusted by the DTP offset.
uint64_t dtpBiasedVal = val - dynamicThreadPointerOffset;
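// Both bias constants are defined earlier in this file and are 0x8000, placing
// the biased base in the middle of a 64 KiB window so that the signed 16-bit
// displacement of a D/DS-form instruction can reach all of it.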
switch (type) {
// TOC biased relocation.
case R_PPC64_GOT16:
case R_PPC64_GOT_TLSGD16:
case R_PPC64_GOT_TLSLD16:
case R_PPC64_TOC16:
return {R_PPC64_ADDR16, tocBiasedVal};
case R_PPC64_GOT16_DS:
case R_PPC64_TOC16_DS:
case R_PPC64_GOT_TPREL16_DS:
case R_PPC64_GOT_DTPREL16_DS:
return {R_PPC64_ADDR16_DS, tocBiasedVal};
case R_PPC64_GOT16_HA:
case R_PPC64_GOT_TLSGD16_HA:
case R_PPC64_GOT_TLSLD16_HA:
case R_PPC64_GOT_TPREL16_HA:
case R_PPC64_GOT_DTPREL16_HA:
case R_PPC64_TOC16_HA:
return {R_PPC64_ADDR16_HA, tocBiasedVal};
case R_PPC64_GOT16_HI:
case R_PPC64_GOT_TLSGD16_HI:
case R_PPC64_GOT_TLSLD16_HI:
case R_PPC64_GOT_TPREL16_HI:
case R_PPC64_GOT_DTPREL16_HI:
case R_PPC64_TOC16_HI:
return {R_PPC64_ADDR16_HI, tocBiasedVal};
case R_PPC64_GOT16_LO:
case R_PPC64_GOT_TLSGD16_LO:
case R_PPC64_GOT_TLSLD16_LO:
case R_PPC64_TOC16_LO:
return {R_PPC64_ADDR16_LO, tocBiasedVal};
case R_PPC64_GOT16_LO_DS:
case R_PPC64_TOC16_LO_DS:
case R_PPC64_GOT_TPREL16_LO_DS:
case R_PPC64_GOT_DTPREL16_LO_DS:
return {R_PPC64_ADDR16_LO_DS, tocBiasedVal};
// Dynamic Thread pointer biased relocation types.
case R_PPC64_DTPREL16:
return {R_PPC64_ADDR16, dtpBiasedVal};
case R_PPC64_DTPREL16_DS:
return {R_PPC64_ADDR16_DS, dtpBiasedVal};
case R_PPC64_DTPREL16_HA:
return {R_PPC64_ADDR16_HA, dtpBiasedVal};
case R_PPC64_DTPREL16_HI:
return {R_PPC64_ADDR16_HI, dtpBiasedVal};
case R_PPC64_DTPREL16_HIGHER:
return {R_PPC64_ADDR16_HIGHER, dtpBiasedVal};
case R_PPC64_DTPREL16_HIGHERA:
return {R_PPC64_ADDR16_HIGHERA, dtpBiasedVal};
case R_PPC64_DTPREL16_HIGHEST:
return {R_PPC64_ADDR16_HIGHEST, dtpBiasedVal};
case R_PPC64_DTPREL16_HIGHESTA:
return {R_PPC64_ADDR16_HIGHESTA, dtpBiasedVal};
case R_PPC64_DTPREL16_LO:
return {R_PPC64_ADDR16_LO, dtpBiasedVal};
case R_PPC64_DTPREL16_LO_DS:
return {R_PPC64_ADDR16_LO_DS, dtpBiasedVal};
case R_PPC64_DTPREL64:
return {R_PPC64_ADDR64, dtpBiasedVal};
default:
return {type, val};
}
}
static bool isTocOptType(RelType type) {
switch (type) {
case R_PPC64_GOT16_HA:
case R_PPC64_GOT16_LO_DS:
case R_PPC64_TOC16_HA:
case R_PPC64_TOC16_LO_DS:
case R_PPC64_TOC16_LO:
return true;
default:
return false;
}
}
void PPC64::relocateOne(uint8_t *loc, RelType type, uint64_t val) const {
// We need to save the original relocation type to use in diagnostics, and
// use the original type to determine if we should toc-optimize the
// instructions being relocated.
RelType originalType = type;
bool shouldTocOptimize = isTocOptType(type);
// For dynamic thread pointer relative, toc-relative, and got-indirect
// relocations, proceed in terms of the corresponding ADDR16 relocation type.
std::tie(type, val) = toAddr16Rel(type, val);
switch (type) {
case R_PPC64_ADDR14: {
checkAlignment(loc, val, 4, type);
// Preserve the AA/LK bits in the branch instruction
uint8_t aalk = loc[3];
write16(loc + 2, (aalk & 3) | (val & 0xfffc));
break;
}
case R_PPC64_ADDR16:
checkIntUInt(loc, val, 16, originalType);
write16(loc, val);
break;
case R_PPC64_ADDR32:
checkIntUInt(loc, val, 32, originalType);
write32(loc, val);
break;
case R_PPC64_ADDR16_DS:
case R_PPC64_TPREL16_DS: {
checkInt(loc, val, 16, originalType);
// DQ-form instructions use bits 28-31 as part of the instruction encoding
// DS-form instructions only use bits 30-31.
uint16_t mask = isDQFormInstruction(readFromHalf16(loc)) ? 0xf : 0x3;
checkAlignment(loc, lo(val), mask + 1, originalType);
write16(loc, (read16(loc) & mask) | lo(val));
} break;
case R_PPC64_ADDR16_HA:
case R_PPC64_REL16_HA:
case R_PPC64_TPREL16_HA:
if (config->tocOptimize && shouldTocOptimize && ha(val) == 0)
writeFromHalf16(loc, 0x60000000);
else
write16(loc, ha(val));
break;
case R_PPC64_ADDR16_HI:
case R_PPC64_REL16_HI:
case R_PPC64_TPREL16_HI:
write16(loc, hi(val));
break;
case R_PPC64_ADDR16_HIGHER:
case R_PPC64_TPREL16_HIGHER:
write16(loc, higher(val));
break;
case R_PPC64_ADDR16_HIGHERA:
case R_PPC64_TPREL16_HIGHERA:
write16(loc, highera(val));
break;
case R_PPC64_ADDR16_HIGHEST:
case R_PPC64_TPREL16_HIGHEST:
write16(loc, highest(val));
break;
case R_PPC64_ADDR16_HIGHESTA:
case R_PPC64_TPREL16_HIGHESTA:
write16(loc, highesta(val));
break;
case R_PPC64_ADDR16_LO:
case R_PPC64_REL16_LO:
case R_PPC64_TPREL16_LO:
// When the high-adjusted part of a toc relocation evaluates to 0, it is
// changed into a nop. The lo part then needs to be updated to use the
// toc-pointer register r2 as the base register.
if (config->tocOptimize && shouldTocOptimize && ha(val) == 0) {
uint32_t insn = readFromHalf16(loc);
if (isInstructionUpdateForm(insn))
error(getErrorLocation(loc) +
"can't toc-optimize an update instruction: 0x" +
utohexstr(insn));
writeFromHalf16(loc, (insn & 0xffe00000) | 0x00020000 | lo(val));
} else {
write16(loc, lo(val));
}
break;
case R_PPC64_ADDR16_LO_DS:
case R_PPC64_TPREL16_LO_DS: {
// DQ-form instructions use bits 28-31 as part of the instruction encoding
// DS-form instructions only use bits 30-31.
uint32_t insn = readFromHalf16(loc);
uint16_t mask = isDQFormInstruction(insn) ? 0xf : 0x3;
checkAlignment(loc, lo(val), mask + 1, originalType);
if (config->tocOptimize && shouldTocOptimize && ha(val) == 0) {
// When the high-adjusted part of a toc relocation evaluates to 0, it is
// changed into a nop. The lo part then needs to be updated to use the
// toc-pointer register r2 as the base register.
if (isInstructionUpdateForm(insn))
error(getErrorLocation(loc) +
"Can't toc-optimize an update instruction: 0x" +
Twine::utohexstr(insn));
insn &= 0xffe00000 | mask;
writeFromHalf16(loc, insn | 0x00020000 | lo(val));
} else {
write16(loc, (read16(loc) & mask) | lo(val));
}
} break;
case R_PPC64_TPREL16:
checkInt(loc, val, 16, originalType);
write16(loc, val);
break;
case R_PPC64_REL32:
checkInt(loc, val, 32, type);
write32(loc, val);
break;
case R_PPC64_ADDR64:
case R_PPC64_REL64:
case R_PPC64_TOC:
write64(loc, val);
break;
case R_PPC64_REL14: {
uint32_t mask = 0x0000FFFC;
checkInt(loc, val, 16, type);
checkAlignment(loc, val, 4, type);
write32(loc, (read32(loc) & ~mask) | (val & mask));
break;
}
case R_PPC64_REL24: {
uint32_t mask = 0x03FFFFFC;
checkInt(loc, val, 26, type);
checkAlignment(loc, val, 4, type);
write32(loc, (read32(loc) & ~mask) | (val & mask));
break;
}
case R_PPC64_DTPREL64:
write64(loc, val - dynamicThreadPointerOffset);
break;
default:
- error(getErrorLocation(loc) + "unrecognized relocation " + toString(type));
+ llvm_unreachable("unknown relocation");
}
}
bool PPC64::needsThunk(RelExpr expr, RelType type, const InputFile *file,
uint64_t branchAddr, const Symbol &s) const {
if (type != R_PPC64_REL14 && type != R_PPC64_REL24)
return false;
// If a function is in the Plt it needs to be called with a call-stub.
if (s.isInPlt())
return true;
// If a symbol is a weak undefined and we are compiling an executable
// it doesn't need a range-extending thunk since it can't be called.
if (s.isUndefWeak() && !config->shared)
return false;
// If the offset exceeds the range of the branch type then it will need
// a range-extending thunk.
// See the comment in getRelocTargetVA() about R_PPC64_CALL.
return !inBranchRange(type, branchAddr,
s.getVA() +
getPPC64GlobalEntryToLocalEntryOffset(s.stOther));
}
uint32_t PPC64::getThunkSectionSpacing() const {
// See comment in Arch/ARM.cpp for a more detailed explanation of
// getThunkSectionSpacing(). For PPC64 we pick the constant here based on
// R_PPC64_REL24, which is used by unconditional branch instructions.
// 0x2000000 = (1 << (24 - 1)) * 4
return 0x2000000;
}
bool PPC64::inBranchRange(RelType type, uint64_t src, uint64_t dst) const {
int64_t offset = dst - src;
if (type == R_PPC64_REL14)
return isInt<16>(offset);
if (type == R_PPC64_REL24)
return isInt<26>(offset);
llvm_unreachable("unsupported relocation type used in branch");
}
RelExpr PPC64::adjustRelaxExpr(RelType type, const uint8_t *data,
RelExpr expr) const {
if (expr == R_RELAX_TLS_GD_TO_IE)
return R_RELAX_TLS_GD_TO_IE_GOT_OFF;
if (expr == R_RELAX_TLS_LD_TO_LE)
return R_RELAX_TLS_LD_TO_LE_ABS;
return expr;
}
// Reference: 3.7.4.1 of the 64-bit ELF V2 abi supplement.
// The general dynamic code sequence for a global `x` uses 4 instructions.
// Instruction Relocation Symbol
// addis r3, r2, x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
// addi r3, r3, x@got@tlsgd@l R_PPC64_GOT_TLSGD16_LO x
// bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
// R_PPC64_REL24 __tls_get_addr
// nop None None
//
// Relaxing to initial-exec entails:
// 1) Convert the addis/addi pair that builds the address of the tls_index
// struct for 'x' to an addis/ld pair that loads an offset from a got-entry.
// 2) Convert the call to __tls_get_addr to a nop.
// 3) Convert the nop following the call to an add of the loaded offset to the
// thread pointer.
// Since the nop must directly follow the call, the R_PPC64_TLSGD relocation is
// used as the relaxation hint for both steps 2 and 3.
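// After the relaxation performed below the sequence becomes:
//   addis r3, r2, x@got@tprel@ha    R_PPC64_GOT_TPREL16_HA
//   ld    r3, x@got@tprel@l(r3)     R_PPC64_GOT_TPREL16_LO_DS
//   nop
//   add   r3, r3, r13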
void PPC64::relaxTlsGdToIe(uint8_t *loc, RelType type, uint64_t val) const {
switch (type) {
case R_PPC64_GOT_TLSGD16_HA:
// This is relaxed from addis rT, r2, sym@got@tlsgd@ha to
// addis rT, r2, sym@got@tprel@ha.
relocateOne(loc, R_PPC64_GOT_TPREL16_HA, val);
return;
case R_PPC64_GOT_TLSGD16:
case R_PPC64_GOT_TLSGD16_LO: {
// Relax from addi r3, rA, sym@got@tlsgd@l to
// ld r3, sym@got@tprel@l(rA)
uint32_t ra = (readFromHalf16(loc) & (0x1f << 16));
writeFromHalf16(loc, 0xe8600000 | ra);
relocateOne(loc, R_PPC64_GOT_TPREL16_LO_DS, val);
return;
}
case R_PPC64_TLSGD:
write32(loc, 0x60000000); // bl __tls_get_addr(sym@tlsgd) --> nop
write32(loc + 4, 0x7c636A14); // nop --> add r3, r3, r13
return;
default:
llvm_unreachable("unsupported relocation for TLS GD to IE relaxation");
}
}
// The prologue for a split-stack function is expected to look roughly
// like this:
// .Lglobal_entry_point:
// # TOC pointer initialization.
// ...
// .Llocal_entry_point:
// # load the __private_ss member of the thread's tcbhead.
// ld r0,-0x7000-64(r13)
// # subtract the function's stack size from the stack pointer.
// addis r12, r1, ha(-stack-frame size)
// addi r12, r12, l(-stack-frame size)
// # compare the needed stack address with the stack limit loaded above and
// # branch to allocate_more_stack if more space is needed; otherwise fall
// # through to the 'normal' function body.
// cmpld cr7,r12,r0
// blt- cr7, .Lallocate_more_stack
//
// -) The allocate_more_stack block might be placed after the split-stack
// prologue and the `blt-` replaced with a `bge+ .Lnormal_func_body`
// instead.
// -) If either the addis or addi is not needed, due to the stack size being
// smaller than 32K or a multiple of 64K, it will be replaced with a nop,
// but there will always be 2 instructions the linker can overwrite for the
// adjusted stack size.
//
// The linker's job here is to increase the stack size used in the addis/addi
// pair by split-stack-adjust-size.
// addis r12, r1, ha(-stack-frame size - split-stack-adjust-size)
// addi r12, r12, l(-stack-frame size - split-stack-adjust-size)
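// A worked example (the 0x4000 adjustment is a hypothetical value, used only
// for illustration): a prologue built for a 0x9000-byte frame might encode
//   addis r12, r1, -1          (hiImm = -1)
//   addi  r12, r12, 0x7000     (loImm = 0x7000)
// i.e. stackFrameSize = -0x9000. Adjusting by 0x4000 gives -0xD000, which
// re-encodes as hiImm = -1, loImm = 0x3000:
//   addis r12, r1, -1
//   addi  r12, r12, 0x3000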
bool PPC64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
uint8_t stOther) const {
// If the caller has a global entry point adjust the buffer past it. The start
// of the split-stack prologue will be at the local entry point.
loc += getPPC64GlobalEntryToLocalEntryOffset(stOther);
// At the very least we expect to see a load of some split-stack data from the
// tcb, and 2 instructions that calculate the ending stack address this
// function will require. If there is not enough room for at least 3
// instructions it can't be a split-stack prologue.
if (loc + 12 >= end)
return false;
// First instruction must be `ld r0, -0x7000-64(r13)`
if (read32(loc) != 0xe80d8fc0)
return false;
int16_t hiImm = 0;
int16_t loImm = 0;
// The first instruction can be either an addis if the frame size is larger
// than 32K, or an addi if the size is less than 32K.
int32_t firstInstr = read32(loc + 4);
if (getPrimaryOpCode(firstInstr) == 15) {
hiImm = firstInstr & 0xFFFF;
} else if (getPrimaryOpCode(firstInstr) == 14) {
loImm = firstInstr & 0xFFFF;
} else {
return false;
}
// The second instruction is either an addi or a nop. If the first instruction
// was an addi then loImm is set and the second instruction must be a nop.
uint32_t secondInstr = read32(loc + 8);
if (!loImm && getPrimaryOpCode(secondInstr) == 14) {
loImm = secondInstr & 0xFFFF;
} else if (secondInstr != 0x60000000) {
return false;
}
// The register operands of the first instruction should be the stack-pointer
// (r1) as the input (RA) and r12 as the output (RT). If the second
// instruction is not a nop, then it should use r12 as both input and output.
auto checkRegOperands = [](uint32_t instr, uint8_t expectedRT,
uint8_t expectedRA) {
return ((instr & 0x3E00000) >> 21 == expectedRT) &&
((instr & 0x1F0000) >> 16 == expectedRA);
};
if (!checkRegOperands(firstInstr, 12, 1))
return false;
if (secondInstr != 0x60000000 && !checkRegOperands(secondInstr, 12, 12))
return false;
int32_t stackFrameSize = (hiImm * 65536) + loImm;
// Check that the adjusted size doesn't overflow what we can represent with 2
// instructions.
if (stackFrameSize < config->splitStackAdjustSize + INT32_MIN) {
error(getErrorLocation(loc) + "split-stack prologue adjustment overflows");
return false;
}
int32_t adjustedStackFrameSize =
stackFrameSize - config->splitStackAdjustSize;
loImm = adjustedStackFrameSize & 0xFFFF;
hiImm = (adjustedStackFrameSize + 0x8000) >> 16;
if (hiImm) {
write32(loc + 4, 0x3D810000 | (uint16_t)hiImm);
// If the low immediate is zero the second instruction will be a nop.
secondInstr = loImm ? 0x398C0000 | (uint16_t)loImm : 0x60000000;
write32(loc + 8, secondInstr);
} else {
// addi r12, r1, imm
write32(loc + 4, (0x39810000) | (uint16_t)loImm);
write32(loc + 8, 0x60000000);
}
return true;
}
TargetInfo *elf::getPPC64TargetInfo() {
static PPC64 target;
return &target;
}
Index: projects/clang900-import/contrib/llvm/tools/lld/ELF/InputFiles.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/ELF/InputFiles.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/ELF/InputFiles.cpp (revision 351722)
@@ -1,1645 +1,1645 @@
//===- InputFiles.cpp -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "InputFiles.h"
#include "Driver.h"
#include "InputSection.h"
#include "LinkerScript.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "SyntheticSections.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/LTO/LTO.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/ARMAttributeParser.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TarWriter.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::ELF;
using namespace llvm::object;
using namespace llvm::sys;
using namespace llvm::sys::fs;
using namespace llvm::support::endian;
using namespace lld;
using namespace lld::elf;
bool InputFile::isInGroup;
uint32_t InputFile::nextGroupId;
std::vector<BinaryFile *> elf::binaryFiles;
std::vector<BitcodeFile *> elf::bitcodeFiles;
std::vector<LazyObjFile *> elf::lazyObjFiles;
std::vector<InputFile *> elf::objectFiles;
std::vector<SharedFile *> elf::sharedFiles;
std::unique_ptr<TarWriter> elf::tar;
static ELFKind getELFKind(MemoryBufferRef mb, StringRef archiveName) {
unsigned char size;
unsigned char endian;
std::tie(size, endian) = getElfArchType(mb.getBuffer());
auto report = [&](StringRef msg) {
StringRef filename = mb.getBufferIdentifier();
if (archiveName.empty())
fatal(filename + ": " + msg);
else
fatal(archiveName + "(" + filename + "): " + msg);
};
if (!mb.getBuffer().startswith(ElfMagic))
report("not an ELF file");
if (endian != ELFDATA2LSB && endian != ELFDATA2MSB)
report("corrupted ELF file: invalid data encoding");
if (size != ELFCLASS32 && size != ELFCLASS64)
report("corrupted ELF file: invalid file class");
size_t bufSize = mb.getBuffer().size();
if ((size == ELFCLASS32 && bufSize < sizeof(Elf32_Ehdr)) ||
(size == ELFCLASS64 && bufSize < sizeof(Elf64_Ehdr)))
report("corrupted ELF file: file is too short");
if (size == ELFCLASS32)
return (endian == ELFDATA2LSB) ? ELF32LEKind : ELF32BEKind;
return (endian == ELFDATA2LSB) ? ELF64LEKind : ELF64BEKind;
}
InputFile::InputFile(Kind k, MemoryBufferRef m)
: mb(m), groupId(nextGroupId), fileKind(k) {
// All files within the same --{start,end}-group get the same group ID.
// Otherwise, a new file will get a new group ID.
if (!isInGroup)
++nextGroupId;
}
Optional<MemoryBufferRef> elf::readFile(StringRef path) {
// The --chroot option changes our virtual root directory.
// This is useful when you are dealing with files created by --reproduce.
if (!config->chroot.empty() && path.startswith("/"))
path = saver.save(config->chroot + path);
log(path);
auto mbOrErr = MemoryBuffer::getFile(path, -1, false);
if (auto ec = mbOrErr.getError()) {
error("cannot open " + path + ": " + ec.message());
return None;
}
std::unique_ptr<MemoryBuffer> &mb = *mbOrErr;
MemoryBufferRef mbref = mb->getMemBufferRef();
make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take MB ownership
if (tar)
tar->append(relativeToRoot(path), mbref.getBuffer());
return mbref;
}
// All input object files must be for the same architecture
// (e.g. it does not make sense to link x86 object files with
// MIPS object files.) This function checks for that error.
static bool isCompatible(InputFile *file) {
if (!file->isElf() && !isa<BitcodeFile>(file))
return true;
if (file->ekind == config->ekind && file->emachine == config->emachine) {
if (config->emachine != EM_MIPS)
return true;
if (isMipsN32Abi(file) == config->mipsN32Abi)
return true;
}
if (!config->emulation.empty()) {
error(toString(file) + " is incompatible with " + config->emulation);
} else {
InputFile *existing;
if (!objectFiles.empty())
existing = objectFiles[0];
else if (!sharedFiles.empty())
existing = sharedFiles[0];
else
existing = bitcodeFiles[0];
error(toString(file) + " is incompatible with " + toString(existing));
}
return false;
}
template <class ELFT> static void doParseFile(InputFile *file) {
if (!isCompatible(file))
return;
// Binary file
if (auto *f = dyn_cast<BinaryFile>(file)) {
binaryFiles.push_back(f);
f->parse();
return;
}
// .a file
if (auto *f = dyn_cast<ArchiveFile>(file)) {
f->parse();
return;
}
// Lazy object file
if (auto *f = dyn_cast<LazyObjFile>(file)) {
lazyObjFiles.push_back(f);
f->parse<ELFT>();
return;
}
if (config->trace)
message(toString(file));
// .so file
if (auto *f = dyn_cast<SharedFile>(file)) {
f->parse<ELFT>();
return;
}
// LLVM bitcode file
if (auto *f = dyn_cast<BitcodeFile>(file)) {
bitcodeFiles.push_back(f);
f->parse<ELFT>();
return;
}
// Regular object file
objectFiles.push_back(file);
cast<ObjFile<ELFT>>(file)->parse();
}
// Add symbols in File to the symbol table.
void elf::parseFile(InputFile *file) {
switch (config->ekind) {
case ELF32LEKind:
doParseFile<ELF32LE>(file);
return;
case ELF32BEKind:
doParseFile<ELF32BE>(file);
return;
case ELF64LEKind:
doParseFile<ELF64LE>(file);
return;
case ELF64BEKind:
doParseFile<ELF64BE>(file);
return;
default:
llvm_unreachable("unknown ELFT");
}
}
// Concatenates arguments to construct a string representing an error location.
static std::string createFileLineMsg(StringRef path, unsigned line) {
std::string filename = path::filename(path);
std::string lineno = ":" + std::to_string(line);
if (filename == path)
return filename + lineno;
return filename + lineno + " (" + path.str() + lineno + ")";
}
template <class ELFT>
static std::string getSrcMsgAux(ObjFile<ELFT> &file, const Symbol &sym,
InputSectionBase &sec, uint64_t offset) {
// In DWARF, functions and variables are stored in different places.
// First, look up a function for a given offset.
if (Optional<DILineInfo> info = file.getDILineInfo(&sec, offset))
return createFileLineMsg(info->FileName, info->Line);
// If it failed, lookup again as a variable.
if (Optional<std::pair<std::string, unsigned>> fileLine =
file.getVariableLoc(sym.getName()))
return createFileLineMsg(fileLine->first, fileLine->second);
// file.sourceFile contains the STT_FILE symbol name, and that is a last resort.
return file.sourceFile;
}
std::string InputFile::getSrcMsg(const Symbol &sym, InputSectionBase &sec,
uint64_t offset) {
if (kind() != ObjKind)
return "";
switch (config->ekind) {
default:
llvm_unreachable("Invalid kind");
case ELF32LEKind:
return getSrcMsgAux(cast<ObjFile<ELF32LE>>(*this), sym, sec, offset);
case ELF32BEKind:
return getSrcMsgAux(cast<ObjFile<ELF32BE>>(*this), sym, sec, offset);
case ELF64LEKind:
return getSrcMsgAux(cast<ObjFile<ELF64LE>>(*this), sym, sec, offset);
case ELF64BEKind:
return getSrcMsgAux(cast<ObjFile<ELF64BE>>(*this), sym, sec, offset);
}
}
template <class ELFT> void ObjFile<ELFT>::initializeDwarf() {
dwarf = llvm::make_unique<DWARFContext>(make_unique<LLDDwarfObj<ELFT>>(this));
for (std::unique_ptr<DWARFUnit> &cu : dwarf->compile_units()) {
auto report = [](Error err) {
handleAllErrors(std::move(err),
[](ErrorInfoBase &info) { warn(info.message()); });
};
Expected<const DWARFDebugLine::LineTable *> expectedLT =
dwarf->getLineTableForUnit(cu.get(), report);
const DWARFDebugLine::LineTable *lt = nullptr;
if (expectedLT)
lt = *expectedLT;
else
report(expectedLT.takeError());
if (!lt)
continue;
lineTables.push_back(lt);
// Loop over variable records and insert them to variableLoc.
for (const auto &entry : cu->dies()) {
DWARFDie die(cu.get(), &entry);
// Skip all tags that are not variables.
if (die.getTag() != dwarf::DW_TAG_variable)
continue;
// Skip local variables because we don't need them for generating
// error messages. In general, only non-local symbols can fail to be
// linked.
if (!dwarf::toUnsigned(die.find(dwarf::DW_AT_external), 0))
continue;
// Get the source filename index for the variable.
unsigned file = dwarf::toUnsigned(die.find(dwarf::DW_AT_decl_file), 0);
if (!lt->hasFileAtIndex(file))
continue;
// Get the line number on which the variable is declared.
unsigned line = dwarf::toUnsigned(die.find(dwarf::DW_AT_decl_line), 0);
// Here we want to take the variable name to add it into variableLoc.
// A variable can have both a regular name and a linkage name associated
// with it. We first try the linkage name, as it can differ, for example
// when there are two variables in different namespaces of the same object.
// Otherwise we use the regular name, and we also handle the case where it
// is absent because the input object file lacks some debug info.
StringRef name =
dwarf::toString(die.find(dwarf::DW_AT_linkage_name),
dwarf::toString(die.find(dwarf::DW_AT_name), ""));
if (!name.empty())
variableLoc.insert({name, {lt, file, line}});
}
}
}
// Returns the pair of file name and line number describing location of data
// object (variable, array, etc) definition.
template <class ELFT>
Optional<std::pair<std::string, unsigned>>
ObjFile<ELFT>::getVariableLoc(StringRef name) {
llvm::call_once(initDwarfLine, [this]() { initializeDwarf(); });
// Return if we have no debug information about data object.
auto it = variableLoc.find(name);
if (it == variableLoc.end())
return None;
// Take file name string from line table.
std::string fileName;
if (!it->second.lt->getFileNameByIndex(
it->second.file, {},
DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, fileName))
return None;
return std::make_pair(fileName, it->second.line);
}
// Returns source line information for a given offset
// using DWARF debug info.
template <class ELFT>
Optional<DILineInfo> ObjFile<ELFT>::getDILineInfo(InputSectionBase *s,
uint64_t offset) {
llvm::call_once(initDwarfLine, [this]() { initializeDwarf(); });
// Detect SectionIndex for specified section.
uint64_t sectionIndex = object::SectionedAddress::UndefSection;
ArrayRef<InputSectionBase *> sections = s->file->getSections();
for (uint64_t curIndex = 0; curIndex < sections.size(); ++curIndex) {
if (s == sections[curIndex]) {
sectionIndex = curIndex;
break;
}
}
// Use a fake address calculated by adding the section file offset and the
// offset in the section. See the comments for the ObjectInfo class.
DILineInfo info;
for (const llvm::DWARFDebugLine::LineTable *lt : lineTables) {
if (lt->getFileLineInfoForAddress(
{s->getOffsetInFile() + offset, sectionIndex}, nullptr,
DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, info))
return info;
}
return None;
}
// Returns "<internal>", "foo.a(bar.o)" or "baz.o".
std::string lld::toString(const InputFile *f) {
if (!f)
return "<internal>";
if (f->toStringCache.empty()) {
if (f->archiveName.empty())
f->toStringCache = f->getName();
else
f->toStringCache = (f->archiveName + "(" + f->getName() + ")").str();
}
return f->toStringCache;
}
ELFFileBase::ELFFileBase(Kind k, MemoryBufferRef mb) : InputFile(k, mb) {
ekind = getELFKind(mb, "");
switch (ekind) {
case ELF32LEKind:
init<ELF32LE>();
break;
case ELF32BEKind:
init<ELF32BE>();
break;
case ELF64LEKind:
init<ELF64LE>();
break;
case ELF64BEKind:
init<ELF64BE>();
break;
default:
llvm_unreachable("getELFKind");
}
}
template <typename Elf_Shdr>
static const Elf_Shdr *findSection(ArrayRef<Elf_Shdr> sections, uint32_t type) {
for (const Elf_Shdr &sec : sections)
if (sec.sh_type == type)
return &sec;
return nullptr;
}
template <class ELFT> void ELFFileBase::init() {
using Elf_Shdr = typename ELFT::Shdr;
using Elf_Sym = typename ELFT::Sym;
// Initialize trivial attributes.
const ELFFile<ELFT> &obj = getObj<ELFT>();
emachine = obj.getHeader()->e_machine;
osabi = obj.getHeader()->e_ident[llvm::ELF::EI_OSABI];
abiVersion = obj.getHeader()->e_ident[llvm::ELF::EI_ABIVERSION];
ArrayRef<Elf_Shdr> sections = CHECK(obj.sections(), this);
// Find a symbol table.
bool isDSO =
(identify_magic(mb.getBuffer()) == file_magic::elf_shared_object);
const Elf_Shdr *symtabSec =
findSection(sections, isDSO ? SHT_DYNSYM : SHT_SYMTAB);
if (!symtabSec)
return;
// Initialize members corresponding to a symbol table.
firstGlobal = symtabSec->sh_info;
ArrayRef<Elf_Sym> eSyms = CHECK(obj.symbols(symtabSec), this);
if (firstGlobal == 0 || firstGlobal > eSyms.size())
fatal(toString(this) + ": invalid sh_info in symbol table");
elfSyms = reinterpret_cast<const void *>(eSyms.data());
numELFSyms = eSyms.size();
stringTable = CHECK(obj.getStringTableForSymtab(*symtabSec, sections), this);
}
template <class ELFT>
uint32_t ObjFile<ELFT>::getSectionIndex(const Elf_Sym &sym) const {
return CHECK(
this->getObj().getSectionIndex(&sym, getELFSyms<ELFT>(), shndxTable),
this);
}
template <class ELFT> ArrayRef<Symbol *> ObjFile<ELFT>::getLocalSymbols() {
if (this->symbols.empty())
return {};
return makeArrayRef(this->symbols).slice(1, this->firstGlobal - 1);
}
template <class ELFT> ArrayRef<Symbol *> ObjFile<ELFT>::getGlobalSymbols() {
return makeArrayRef(this->symbols).slice(this->firstGlobal);
}
template <class ELFT> void ObjFile<ELFT>::parse(bool ignoreComdats) {
// Read a section table. justSymbols is usually false.
if (this->justSymbols)
initializeJustSymbols();
else
initializeSections(ignoreComdats);
// Read a symbol table.
initializeSymbols();
}
// Sections with SHT_GROUP and comdat bits define comdat section groups.
// They are identified and deduplicated by group name. This function
// returns a group name.
template <class ELFT>
StringRef ObjFile<ELFT>::getShtGroupSignature(ArrayRef<Elf_Shdr> sections,
const Elf_Shdr &sec) {
typename ELFT::SymRange symbols = this->getELFSyms<ELFT>();
if (sec.sh_info >= symbols.size())
fatal(toString(this) + ": invalid symbol index");
const typename ELFT::Sym &sym = symbols[sec.sh_info];
StringRef signature = CHECK(sym.getName(this->stringTable), this);
// As a special case, if a symbol is a section symbol and has no name,
// we use a section name as a signature.
//
// Such SHT_GROUP sections are invalid from the perspective of the ELF
// standard, but GNU gold 1.14 (the newest version as of July 2017) and
// older produce such sections as outputs for the -r option, so we need
// bug-compatibility.
if (signature.empty() && sym.getType() == STT_SECTION)
return getSectionName(sec);
return signature;
}
template <class ELFT> bool ObjFile<ELFT>::shouldMerge(const Elf_Shdr &sec) {
// On a regular link we don't merge sections if -O0 (default is -O1). This
// sometimes makes the linker significantly faster, although the output will
// be bigger.
//
// Doing the same for -r would create a problem as it would combine sections
// with different sh_entsize. One option would be to just copy every SHF_MERGE
// section as is to the output. While this would produce a valid ELF file with
// usable SHF_MERGE sections, tools like (llvm-)?dwarfdump get confused when
// they see two .debug_str. We could have separate logic for combining
// SHF_MERGE sections based both on their name and sh_entsize, but that seems
// to be more trouble than it is worth. Instead, we just use the regular (-O1)
// logic for -r.
if (config->optimize == 0 && !config->relocatable)
return false;
// A mergeable section with size 0 is useless because it doesn't have
// any data to merge. A mergeable string section with size 0 can be
// argued to be invalid because it doesn't end with a null character.
// We'll avoid a mess by handling them as if they were non-mergeable.
if (sec.sh_size == 0)
return false;
// Check for sh_entsize. The ELF spec is not clear about the zero
// sh_entsize. It says that "the member [sh_entsize] contains 0 if
// the section does not hold a table of fixed-size entries". We know
// that Rust 1.13 produces a string mergeable section with a zero
// sh_entsize. Here we just accept it rather than being picky about it.
uint64_t entSize = sec.sh_entsize;
if (entSize == 0)
return false;
if (sec.sh_size % entSize)
fatal(toString(this) +
": SHF_MERGE section size must be a multiple of sh_entsize");
uint64_t flags = sec.sh_flags;
if (!(flags & SHF_MERGE))
return false;
if (flags & SHF_WRITE)
fatal(toString(this) + ": writable SHF_MERGE section is not supported");
return true;
}
// This is for --just-symbols.
//
// --just-symbols is a very minor feature that allows you to link your
// output against another existing program, so that if you load both your
// program and the other program into memory, your output can refer to the
// other program's symbols.
//
// When the option is given, we link "just symbols". The section table is
// initialized with null pointers.
template <class ELFT> void ObjFile<ELFT>::initializeJustSymbols() {
ArrayRef<Elf_Shdr> sections = CHECK(this->getObj().sections(), this);
this->sections.resize(sections.size());
}
// An ELF object file may contain a `.deplibs` section. If it exists, the
// section contains a list of library specifiers such as `m` for libm. This
// function resolves a given name by finding the first matching library, checking
// the various ways that a library can be specified to LLD. This ELF extension
// is a form of autolinking and is called `dependent libraries`. It is currently
// unique to LLVM and lld.
static void addDependentLibrary(StringRef specifier, const InputFile *f) {
if (!config->dependentLibraries)
return;
if (fs::exists(specifier))
driver->addFile(specifier, /*withLOption=*/false);
else if (Optional<std::string> s = findFromSearchPaths(specifier))
driver->addFile(*s, /*withLOption=*/true);
else if (Optional<std::string> s = searchLibraryBaseName(specifier))
driver->addFile(*s, /*withLOption=*/true);
else
error(toString(f) +
": unable to find library from dependent library specifier: " +
specifier);
}
template <class ELFT>
void ObjFile<ELFT>::initializeSections(bool ignoreComdats) {
const ELFFile<ELFT> &obj = this->getObj();
ArrayRef<Elf_Shdr> objSections = CHECK(obj.sections(), this);
uint64_t size = objSections.size();
this->sections.resize(size);
this->sectionStringTable =
CHECK(obj.getSectionStringTable(objSections), this);
for (size_t i = 0, e = objSections.size(); i < e; i++) {
if (this->sections[i] == &InputSection::discarded)
continue;
const Elf_Shdr &sec = objSections[i];
if (sec.sh_type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE)
cgProfile =
check(obj.template getSectionContentsAsArray<Elf_CGProfile>(&sec));
// SHF_EXCLUDE'ed sections are discarded by the linker. However,
// if -r is given, we'll let the final link discard such sections.
// This is compatible with GNU.
if ((sec.sh_flags & SHF_EXCLUDE) && !config->relocatable) {
if (sec.sh_type == SHT_LLVM_ADDRSIG) {
// We ignore the address-significance table if we know that the object
// file was created by objcopy or ld -r. This is because these tools
// will reorder the symbols in the symbol table, invalidating the data
// in the address-significance table, which refers to symbols by index.
if (sec.sh_link != 0)
this->addrsigSec = &sec;
else if (config->icf == ICFLevel::Safe)
warn(toString(this) + ": --icf=safe is incompatible with object "
"files created using objcopy or ld -r");
}
this->sections[i] = &InputSection::discarded;
continue;
}
switch (sec.sh_type) {
case SHT_GROUP: {
// De-duplicate section groups by their signatures.
StringRef signature = getShtGroupSignature(objSections, sec);
this->sections[i] = &InputSection::discarded;
ArrayRef<Elf_Word> entries =
CHECK(obj.template getSectionContentsAsArray<Elf_Word>(&sec), this);
if (entries.empty())
fatal(toString(this) + ": empty SHT_GROUP");
// The first word of a SHT_GROUP section contains flags. Currently,
// the standard defines only the "GRP_COMDAT" flag for the COMDAT group.
// A group with an empty flag doesn't define anything; such sections
// are just skipped.
if (entries[0] == 0)
continue;
if (entries[0] != GRP_COMDAT)
fatal(toString(this) + ": unsupported SHT_GROUP format");
bool isNew =
ignoreComdats ||
symtab->comdatGroups.try_emplace(CachedHashStringRef(signature), this)
.second;
if (isNew) {
if (config->relocatable)
this->sections[i] = createInputSection(sec);
continue;
}
// Otherwise, discard group members.
for (uint32_t secIndex : entries.slice(1)) {
if (secIndex >= size)
fatal(toString(this) +
": invalid section index in group: " + Twine(secIndex));
this->sections[secIndex] = &InputSection::discarded;
}
break;
}
case SHT_SYMTAB_SHNDX:
shndxTable = CHECK(obj.getSHNDXTable(sec, objSections), this);
break;
case SHT_SYMTAB:
case SHT_STRTAB:
case SHT_NULL:
break;
default:
this->sections[i] = createInputSection(sec);
}
// Sections with a SHF_LINK_ORDER dependency, such as .ARM.exidx, have a
// reverse dependency on the InputSection they depend on; the dependency is
// identified by the sh_link field.
if (sec.sh_flags & SHF_LINK_ORDER) {
InputSectionBase *linkSec = nullptr;
if (sec.sh_link < this->sections.size())
linkSec = this->sections[sec.sh_link];
if (!linkSec)
fatal(toString(this) +
": invalid sh_link index: " + Twine(sec.sh_link));
InputSection *isec = cast<InputSection>(this->sections[i]);
linkSec->dependentSections.push_back(isec);
if (!isa<InputSection>(linkSec))
error("a section " + isec->name +
" with SHF_LINK_ORDER should not refer a non-regular "
"section: " +
toString(linkSec));
}
}
}
// For ARM only, to set the EF_ARM_ABI_FLOAT_SOFT or EF_ARM_ABI_FLOAT_HARD
// flag in the ELF Header we need to look at Tag_ABI_VFP_args to find out how
// the input objects have been compiled.
static void updateARMVFPArgs(const ARMAttributeParser &attributes,
const InputFile *f) {
if (!attributes.hasAttribute(ARMBuildAttrs::ABI_VFP_args))
// If an ABI tag isn't present then it is implicitly given the value of 0,
// which maps to ARMBuildAttrs::BaseAAPCS. However, many assembler files,
// including some in glibc that don't use FP args (and should have value 3),
// don't have the attribute, so we do not consider an implicit value of 0
// to be a clash.
return;
unsigned vfpArgs = attributes.getAttributeValue(ARMBuildAttrs::ABI_VFP_args);
ARMVFPArgKind arg;
switch (vfpArgs) {
case ARMBuildAttrs::BaseAAPCS:
arg = ARMVFPArgKind::Base;
break;
case ARMBuildAttrs::HardFPAAPCS:
arg = ARMVFPArgKind::VFP;
break;
case ARMBuildAttrs::ToolChainFPPCS:
// Tool chain specific convention that conforms to neither AAPCS variant.
arg = ARMVFPArgKind::ToolChain;
break;
case ARMBuildAttrs::CompatibleFPAAPCS:
// Object compatible with all conventions.
return;
default:
error(toString(f) + ": unknown Tag_ABI_VFP_args value: " + Twine(vfpArgs));
return;
}
// Follow ld.bfd and error if there is a mix of calling conventions.
if (config->armVFPArgs != arg && config->armVFPArgs != ARMVFPArgKind::Default)
error(toString(f) + ": incompatible Tag_ABI_VFP_args");
else
config->armVFPArgs = arg;
}
// The ARM support in lld makes some use of instructions that are not available
// on all ARM architectures. Namely:
// - Use of BLX instruction for interworking between ARM and Thumb state.
// - Use of the extended Thumb branch encoding in relocation.
// - Use of the MOVT/MOVW instructions in Thumb Thunks.
// The ARM Attributes section contains information about the architecture chosen
// at compile time. We follow the convention that if at least one input object
// is compiled with an architecture that supports these features then lld is
// permitted to use them.
static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) {
if (!attributes.hasAttribute(ARMBuildAttrs::CPU_arch))
return;
auto arch = attributes.getAttributeValue(ARMBuildAttrs::CPU_arch);
switch (arch) {
case ARMBuildAttrs::Pre_v4:
case ARMBuildAttrs::v4:
case ARMBuildAttrs::v4T:
// Architectures prior to v5 do not support the BLX instruction
break;
case ARMBuildAttrs::v5T:
case ARMBuildAttrs::v5TE:
case ARMBuildAttrs::v5TEJ:
case ARMBuildAttrs::v6:
case ARMBuildAttrs::v6KZ:
case ARMBuildAttrs::v6K:
config->armHasBlx = true;
// Architectures used in pre-Cortex processors do not support
// the J1 = 1, J2 = 1 Thumb branch range extension, with the exception
// of architecture v6T2 (arm1156t2-s and arm1156t2f-s), which does.
break;
default:
// All other Architectures have BLX and extended branch encoding
config->armHasBlx = true;
config->armJ1J2BranchEncoding = true;
if (arch != ARMBuildAttrs::v6_M && arch != ARMBuildAttrs::v6S_M)
// All Architectures used in Cortex processors with the exception
// of v6-M and v6S-M have the MOVT and MOVW instructions.
config->armHasMovtMovw = true;
break;
}
}
// If a source file is compiled with x86 hardware-assisted call flow control
// enabled, the generated object file contains feature flags indicating that
// fact. This function reads the feature flags and returns them.
//
// Essentially we want to read a single 32-bit value in this function, but this
// function is rather complicated because the value is buried deep inside a
// .note.gnu.property section.
//
// The section consists of one or more NOTE records. Each NOTE record consists
// of zero or more type-length-value fields. We want to find a field of a
// certain type. It seems a bit too much just to store a 32-bit value; perhaps
// the ABI is unnecessarily complicated.
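// Roughly, the layout walked below is (per the ELF note format and the GNU
// program property extension):
//   Elf_Nhdr { n_namesz = 4, n_descsz, n_type = NT_GNU_PROPERTY_TYPE_0 }
//   name: "GNU\0"
//   desc: one or more { pr_type: u32, pr_datasz: u32, pr_data[pr_datasz],
//         padding to 8 bytes on ELF64 (4 bytes on ELF32) }
// where GNU_PROPERTY_{X86,AARCH64}_FEATURE_1_AND carries a 4-byte bitmask in
// pr_data.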
template <class ELFT>
static uint32_t readAndFeatures(ObjFile<ELFT> *obj, ArrayRef<uint8_t> data) {
using Elf_Nhdr = typename ELFT::Nhdr;
using Elf_Note = typename ELFT::Note;
uint32_t featuresSet = 0;
while (!data.empty()) {
// Read one NOTE record.
if (data.size() < sizeof(Elf_Nhdr))
fatal(toString(obj) + ": .note.gnu.property: section too short");
auto *nhdr = reinterpret_cast<const Elf_Nhdr *>(data.data());
if (data.size() < nhdr->getSize())
fatal(toString(obj) + ": .note.gnu.property: section too short");
Elf_Note note(*nhdr);
if (nhdr->n_type != NT_GNU_PROPERTY_TYPE_0 || note.getName() != "GNU") {
data = data.slice(nhdr->getSize());
continue;
}
uint32_t featureAndType = config->emachine == EM_AARCH64
? GNU_PROPERTY_AARCH64_FEATURE_1_AND
: GNU_PROPERTY_X86_FEATURE_1_AND;
// Read a body of a NOTE record, which consists of type-length-value fields.
ArrayRef<uint8_t> desc = note.getDesc();
while (!desc.empty()) {
if (desc.size() < 8)
fatal(toString(obj) + ": .note.gnu.property: section too short");
uint32_t type = read32le(desc.data());
uint32_t size = read32le(desc.data() + 4);
if (type == featureAndType) {
// We found a FEATURE_1_AND field. There may be more than one of these
// in a .note.gnu.property section; for a relocatable object we
// accumulate the bits set.
featuresSet |= read32le(desc.data() + 8);
}
// On 64-bit, a payload may be followed by a 4-byte padding to make its
// size a multiple of 8.
if (ELFT::Is64Bits)
size = alignTo(size, 8);
desc = desc.slice(size + 8); // +8 for Type and Size
}
// Go to next NOTE record to look for more FEATURE_1_AND descriptions.
data = data.slice(nhdr->getSize());
}
return featuresSet;
}
template <class ELFT>
InputSectionBase *ObjFile<ELFT>::getRelocTarget(const Elf_Shdr &sec) {
uint32_t idx = sec.sh_info;
if (idx >= this->sections.size())
fatal(toString(this) + ": invalid relocated section index: " + Twine(idx));
InputSectionBase *target = this->sections[idx];
// Strictly speaking, a relocation section must be included in the
// group of the section it relocates. However, LLVM 3.3 and earlier
// would fail to do so, so we gracefully handle that case.
if (target == &InputSection::discarded)
return nullptr;
if (!target)
fatal(toString(this) + ": unsupported relocation reference");
return target;
}
// Create a regular InputSection class that has the same contents
// as a given section.
static InputSection *toRegularSection(MergeInputSection *sec) {
return make<InputSection>(sec->file, sec->flags, sec->type, sec->alignment,
sec->data(), sec->name);
}
template <class ELFT>
InputSectionBase *ObjFile<ELFT>::createInputSection(const Elf_Shdr &sec) {
StringRef name = getSectionName(sec);
switch (sec.sh_type) {
case SHT_ARM_ATTRIBUTES: {
if (config->emachine != EM_ARM)
break;
ARMAttributeParser attributes;
ArrayRef<uint8_t> contents = check(this->getObj().getSectionContents(&sec));
attributes.Parse(contents, /*isLittle*/ config->ekind == ELF32LEKind);
updateSupportedARMFeatures(attributes);
updateARMVFPArgs(attributes, this);
// FIXME: Retain the first attribute section we see. The eglibc ARM
// dynamic loaders require the presence of an attribute section for dlopen
// to work. In a full implementation we would merge all attribute sections.
if (in.armAttributes == nullptr) {
in.armAttributes = make<InputSection>(*this, sec, name);
return in.armAttributes;
}
return &InputSection::discarded;
}
case SHT_LLVM_DEPENDENT_LIBRARIES: {
if (config->relocatable)
break;
ArrayRef<char> data =
CHECK(this->getObj().template getSectionContentsAsArray<char>(&sec), this);
if (!data.empty() && data.back() != '\0') {
error(toString(this) +
": corrupted dependent libraries section (unterminated string): " +
name);
return &InputSection::discarded;
}
for (const char *d = data.begin(), *e = data.end(); d < e;) {
StringRef s(d);
addDependentLibrary(s, this);
d += s.size() + 1;
}
return &InputSection::discarded;
}
case SHT_RELA:
case SHT_REL: {
// Find a relocation target section and associate this section with that.
// Target may have been discarded if it is in a different section group
// and the group is discarded, even though it's a violation of the
// spec. We handle that situation gracefully by discarding dangling
// relocation sections.
InputSectionBase *target = getRelocTarget(sec);
if (!target)
return nullptr;
// This section contains relocation information.
// If -r is given, we do not interpret or apply relocation
// but just copy relocation sections to output.
if (config->relocatable) {
InputSection *relocSec = make<InputSection>(*this, sec, name);
// We want to add a dependency to target, similar to what we do for
// -emit-relocs below. This is useful for the case when a linker script
// contains the "/DISCARD/" directive. It is perhaps uncommon to use a
// script with -r, but we faced it in the Linux kernel and have to handle
// such a case and not crash.
target->dependentSections.push_back(relocSec);
return relocSec;
}
if (target->firstRelocation)
fatal(toString(this) +
": multiple relocation sections to one section are not supported");
// The ELF spec allows mergeable sections with relocations, but they are
// rare, and it is in practice hard to merge such sections by contents,
// because applying relocations at the end of linking changes section
// contents. So we simply handle such sections as non-mergeable ones.
// Degrading like this is acceptable because section merging is optional.
if (auto *ms = dyn_cast<MergeInputSection>(target)) {
target = toRegularSection(ms);
this->sections[sec.sh_info] = target;
}
if (sec.sh_type == SHT_RELA) {
ArrayRef<Elf_Rela> rels = CHECK(getObj().relas(&sec), this);
target->firstRelocation = rels.begin();
target->numRelocations = rels.size();
target->areRelocsRela = true;
} else {
ArrayRef<Elf_Rel> rels = CHECK(getObj().rels(&sec), this);
target->firstRelocation = rels.begin();
target->numRelocations = rels.size();
target->areRelocsRela = false;
}
assert(isUInt<31>(target->numRelocations));
// Relocation sections processed by the linker are usually removed
// from the output, so we return `nullptr` in the normal case.
// However, if -emit-relocs is given, we need to leave them in the output.
// (Some post link analysis tools need this information.)
if (config->emitRelocs) {
InputSection *relocSec = make<InputSection>(*this, sec, name);
// We will not emit a relocation section if the target was discarded.
target->dependentSections.push_back(relocSec);
return relocSec;
}
return nullptr;
}
}
// The GNU linker uses the .note.GNU-stack section as a marker indicating
// that the code in the object file does not expect the stack to be
// executable (in terms of the NX bit). If all input files have the marker,
// the GNU linker adds a PT_GNU_STACK segment to tell the loader to
// make the stack non-executable. Most object files have this section as
// of 2017.
//
// But making the stack non-executable is the norm today for security
// reasons. Failure to do so may result in a serious security issue.
// Therefore, we make LLD always add PT_GNU_STACK unless it is
// explicitly told to do otherwise (by -z execstack). Because stack
// executability is controlled solely by command line options,
// .note.GNU-stack sections are simply ignored.
if (name == ".note.GNU-stack")
return &InputSection::discarded;
// Object files that use processor features such as Intel Control-Flow
// Enforcement Technology (CET) or AArch64 Branch Target Identification (BTI)
// use a .note.gnu.property section containing a bitfield of feature bits,
// such as the GNU_PROPERTY_X86_FEATURE_1_IBT flag. Read that bitmap here.
//
// Since we merge bitmaps from multiple object files to create a new
// .note.gnu.property containing a single AND'ed bitmap, we discard an input
// file's .note.gnu.property section.
if (name == ".note.gnu.property") {
ArrayRef<uint8_t> contents = check(this->getObj().getSectionContents(&sec));
this->andFeatures = readAndFeatures(this, contents);
return &InputSection::discarded;
}
// Split stacks is a feature to support a discontiguous stack,
// commonly used in the programming language Go. For the details,
// see https://gcc.gnu.org/wiki/SplitStacks. An object file compiled
// for split stack will include a .note.GNU-split-stack section.
if (name == ".note.GNU-split-stack") {
if (config->relocatable) {
error("cannot mix split-stack and non-split-stack in a relocatable link");
return &InputSection::discarded;
}
this->splitStack = true;
return &InputSection::discarded;
}
// An object file compiled for split stack, but in which some of the
// functions were compiled with the no_split_stack attribute, will
// include a .note.GNU-no-split-stack section.
if (name == ".note.GNU-no-split-stack") {
this->someNoSplitStack = true;
return &InputSection::discarded;
}
// The linkonce feature is a sort of proto-comdat. Some glibc i386 object
// files contain definitions of symbol "__x86.get_pc_thunk.bx" in linkonce
// sections. Drop those sections to avoid duplicate symbol errors.
// FIXME: This is glibc PR20543, we should remove this hack once that has been
// fixed for a while.
if (name == ".gnu.linkonce.t.__x86.get_pc_thunk.bx" ||
name == ".gnu.linkonce.t.__i686.get_pc_thunk.bx")
return &InputSection::discarded;
// If we are creating a new .build-id section, strip existing .build-id
// sections so that the output won't have more than one .build-id.
// This is not usually a problem because input object files normally don't
// have .build-id sections, but you can create such files by
// "ld.{bfd,gold,lld} -r --build-id", and we want to guard against it.
if (name == ".note.gnu.build-id" && config->buildId != BuildIdKind::None)
return &InputSection::discarded;
// The linker merges EH (exception handling) frames and creates a
// .eh_frame_hdr section for runtime. So we handle them with a special
// class. For relocatable outputs, they are just passed through.
if (name == ".eh_frame" && !config->relocatable)
return make<EhInputSection>(*this, sec, name);
if (shouldMerge(sec))
return make<MergeInputSection>(*this, sec, name);
return make<InputSection>(*this, sec, name);
}
template <class ELFT>
StringRef ObjFile<ELFT>::getSectionName(const Elf_Shdr &sec) {
return CHECK(getObj().getSectionName(&sec, sectionStringTable), this);
}
// Initialize this->Symbols. this->Symbols is a parallel array to
// the file's ELF symbol table.
template <class ELFT> void ObjFile<ELFT>::initializeSymbols() {
ArrayRef<Elf_Sym> eSyms = this->getELFSyms<ELFT>();
this->symbols.resize(eSyms.size());
// Our symbol table may have already been partially initialized
// because of LazyObjFile.
for (size_t i = 0, end = eSyms.size(); i != end; ++i)
if (!this->symbols[i] && eSyms[i].getBinding() != STB_LOCAL)
this->symbols[i] =
symtab->insert(CHECK(eSyms[i].getName(this->stringTable), this));
// Fill this->Symbols. A symbol is either local or global.
for (size_t i = 0, end = eSyms.size(); i != end; ++i) {
const Elf_Sym &eSym = eSyms[i];
// Read symbol attributes.
uint32_t secIdx = getSectionIndex(eSym);
if (secIdx >= this->sections.size())
fatal(toString(this) + ": invalid section index: " + Twine(secIdx));
InputSectionBase *sec = this->sections[secIdx];
uint8_t binding = eSym.getBinding();
uint8_t stOther = eSym.st_other;
uint8_t type = eSym.getType();
uint64_t value = eSym.st_value;
uint64_t size = eSym.st_size;
StringRefZ name = this->stringTable.data() + eSym.st_name;
// Handle local symbols. Local symbols are not added to the symbol
// table because they are not visible from other object files. We
// allocate symbol instances and add their pointers to Symbols.
if (binding == STB_LOCAL) {
if (eSym.getType() == STT_FILE)
sourceFile = CHECK(eSym.getName(this->stringTable), this);
if (this->stringTable.size() <= eSym.st_name)
fatal(toString(this) + ": invalid symbol name offset");
if (eSym.st_shndx == SHN_UNDEF)
this->symbols[i] = make<Undefined>(this, name, binding, stOther, type);
else if (sec == &InputSection::discarded)
this->symbols[i] = make<Undefined>(this, name, binding, stOther, type,
/*DiscardedSecIdx=*/secIdx);
else
this->symbols[i] =
make<Defined>(this, name, binding, stOther, type, value, size, sec);
continue;
}
// Handle global undefined symbols.
if (eSym.st_shndx == SHN_UNDEF) {
this->symbols[i]->resolve(Undefined{this, name, binding, stOther, type});
continue;
}
// Handle global common symbols.
if (eSym.st_shndx == SHN_COMMON) {
if (value == 0 || value >= UINT32_MAX)
fatal(toString(this) + ": common symbol '" + StringRef(name.data) +
"' has invalid alignment: " + Twine(value));
this->symbols[i]->resolve(
CommonSymbol{this, name, binding, stOther, type, value, size});
continue;
}
// If a defined symbol is in a discarded section, handle it as if it
// were an undefined symbol. Such a symbol doesn't comply with the
// standard, but in practice a .eh_frame section often refers directly
// to COMDAT member sections, and if a COMDAT group is discarded, the
// defined symbols referenced from .eh_frame become dangling.
if (sec == &InputSection::discarded) {
this->symbols[i]->resolve(
Undefined{this, name, binding, stOther, type, secIdx});
continue;
}
// Handle global defined symbols.
if (binding == STB_GLOBAL || binding == STB_WEAK ||
binding == STB_GNU_UNIQUE) {
this->symbols[i]->resolve(
Defined{this, name, binding, stOther, type, value, size, sec});
continue;
}
fatal(toString(this) + ": unexpected binding: " + Twine((int)binding));
}
}
ArchiveFile::ArchiveFile(std::unique_ptr<Archive> &&file)
: InputFile(ArchiveKind, file->getMemoryBufferRef()),
file(std::move(file)) {}
void ArchiveFile::parse() {
for (const Archive::Symbol &sym : file->symbols())
symtab->addSymbol(LazyArchive{*this, sym});
}
// Returns a buffer pointing to a member file containing a given symbol.
void ArchiveFile::fetch(const Archive::Symbol &sym) {
Archive::Child c =
CHECK(sym.getMember(), toString(this) +
": could not get the member for symbol " +
- sym.getName());
+ toELFString(sym));
if (!seen.insert(c.getChildOffset()).second)
return;
MemoryBufferRef mb =
CHECK(c.getMemoryBufferRef(),
toString(this) +
": could not get the buffer for the member defining symbol " +
- sym.getName());
+ toELFString(sym));
if (tar && c.getParent()->isThin())
tar->append(relativeToRoot(CHECK(c.getFullName(), this)), mb.getBuffer());
InputFile *file = createObjectFile(
mb, getName(), c.getParent()->isThin() ? 0 : c.getChildOffset());
file->groupId = groupId;
parseFile(file);
}
unsigned SharedFile::vernauxNum;
// Parse the version definitions in the object file if present, and return a
// vector whose nth element contains a pointer to the Elf_Verdef for version
// identifier n. Version identifiers that are not definitions map to nullptr.
template <typename ELFT>
static std::vector<const void *> parseVerdefs(const uint8_t *base,
const typename ELFT::Shdr *sec) {
if (!sec)
return {};
// We cannot determine the largest verdef identifier without inspecting
// every Elf_Verdef, but both bfd and gold assign verdef identifiers
// sequentially starting from 1, so we predict that the largest identifier
// will be verdefCount.
unsigned verdefCount = sec->sh_info;
std::vector<const void *> verdefs(verdefCount + 1);
// Build the Verdefs array by following the chain of Elf_Verdef objects
// from the start of the .gnu.version_d section.
const uint8_t *verdef = base + sec->sh_offset;
for (unsigned i = 0; i != verdefCount; ++i) {
auto *curVerdef = reinterpret_cast<const typename ELFT::Verdef *>(verdef);
verdef += curVerdef->vd_next;
unsigned verdefIndex = curVerdef->vd_ndx;
verdefs.resize(verdefIndex + 1);
verdefs[verdefIndex] = curVerdef;
}
return verdefs;
}
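// Sketch of the walk above (hypothetical numbers): with sh_info == 2 and
// Elf_Verdef entries carrying vd_ndx == 1 and vd_ndx == 2, the returned
// vector is {nullptr, &verdef1, &verdef2}; slot 0 stays null because no
// definition uses the reserved local index, and vd_next chains each entry
// to the next.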
// We do not usually care about alignments of data in shared object
// files because the loader takes care of it. However, if we promote a
// DSO symbol to point to .bss due to copy relocation, we need to keep
// the original alignment requirements. We infer it in this function.
template <typename ELFT>
static uint64_t getAlignment(ArrayRef<typename ELFT::Shdr> sections,
const typename ELFT::Sym &sym) {
uint64_t ret = UINT64_MAX;
if (sym.st_value)
ret = 1ULL << countTrailingZeros((uint64_t)sym.st_value);
if (0 < sym.st_shndx && sym.st_shndx < sections.size())
ret = std::min<uint64_t>(ret, sections[sym.st_shndx].sh_addralign);
return (ret > UINT32_MAX) ? 0 : ret;
}
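// Worked example (hypothetical values): for st_value == 0x1000,
// countTrailingZeros gives 12, so the candidate alignment is 4096; if the
// containing section has sh_addralign == 16, the result is 16. Any value
// above UINT32_MAX (including the "no information" case) collapses to 0,
// meaning the alignment is unknown.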
// Fully parse the shared object file.
//
// This function parses symbol versions. If a DSO has version information,
// the file has a ".gnu.version_d" section which contains symbol version
// definitions. Each symbol is associated with one version through a table
// in the ".gnu.version" section. That table is a parallel array to the
// symbol table, and each entry contains an index into ".gnu.version_d".
//
// The special index 0 is reserved for VER_NDX_LOCAL and 1 is for
// VER_NDX_GLOBAL. There are no table entries for these special versions
// in ".gnu.version_d".
//
// The file format for symbol versioning is perhaps a bit more complicated
// than necessary, but you can easily understand the code if you wrap your
// head around the data structure described above.
template <class ELFT> void SharedFile::parse() {
using Elf_Dyn = typename ELFT::Dyn;
using Elf_Shdr = typename ELFT::Shdr;
using Elf_Sym = typename ELFT::Sym;
using Elf_Verdef = typename ELFT::Verdef;
using Elf_Versym = typename ELFT::Versym;
ArrayRef<Elf_Dyn> dynamicTags;
const ELFFile<ELFT> obj = this->getObj<ELFT>();
ArrayRef<Elf_Shdr> sections = CHECK(obj.sections(), this);
const Elf_Shdr *versymSec = nullptr;
const Elf_Shdr *verdefSec = nullptr;
// Search for .dynsym, .dynamic, .symtab, .gnu.version and .gnu.version_d.
for (const Elf_Shdr &sec : sections) {
switch (sec.sh_type) {
default:
continue;
case SHT_DYNAMIC:
dynamicTags =
CHECK(obj.template getSectionContentsAsArray<Elf_Dyn>(&sec), this);
break;
case SHT_GNU_versym:
versymSec = &sec;
break;
case SHT_GNU_verdef:
verdefSec = &sec;
break;
}
}
if (versymSec && numELFSyms == 0) {
error("SHT_GNU_versym should be associated with symbol table");
return;
}
// Scan the dynamic tags to record DT_NEEDED entries and to initialize
// this->soName from DT_SONAME.
for (const Elf_Dyn &dyn : dynamicTags) {
if (dyn.d_tag == DT_NEEDED) {
uint64_t val = dyn.getVal();
if (val >= this->stringTable.size())
fatal(toString(this) + ": invalid DT_NEEDED entry");
dtNeeded.push_back(this->stringTable.data() + val);
} else if (dyn.d_tag == DT_SONAME) {
uint64_t val = dyn.getVal();
if (val >= this->stringTable.size())
fatal(toString(this) + ": invalid DT_SONAME entry");
soName = this->stringTable.data() + val;
}
}
// DSOs are uniquified not by filename but by soname.
DenseMap<StringRef, SharedFile *>::iterator it;
bool wasInserted;
std::tie(it, wasInserted) = symtab->soNames.try_emplace(soName, this);
// If a DSO appears more than once on the command line with and without
// --as-needed, --no-as-needed takes precedence over --as-needed because a
// user can add an extra DSO with --no-as-needed to force it to be added to
// the dependency list.
it->second->isNeeded |= isNeeded;
if (!wasInserted)
return;
sharedFiles.push_back(this);
verdefs = parseVerdefs<ELFT>(obj.base(), verdefSec);
// Parse ".gnu.version" section which is a parallel array for the symbol
// table. If a given file doesn't have a ".gnu.version" section, we use
// VER_NDX_GLOBAL.
size_t size = numELFSyms - firstGlobal;
std::vector<uint32_t> versyms(size, VER_NDX_GLOBAL);
if (versymSec) {
ArrayRef<Elf_Versym> versym =
CHECK(obj.template getSectionContentsAsArray<Elf_Versym>(versymSec),
this)
.slice(firstGlobal);
for (size_t i = 0; i < size; ++i)
versyms[i] = versym[i].vs_index;
}
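// Sketch of the resulting array (hypothetical values): versyms entries of
// {1, 2, 0x8003} mean: symbol 0 is unversioned (VER_NDX_GLOBAL), symbol 1
// uses version definition 2, and symbol 2 has VERSYM_HIDDEN set, so below
// it is added only under its versioned name (definition 3).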
// System libraries can have a lot of versioned symbols. Using a
// fixed buffer for computing the versioned name (foo@ver) can save a
// lot of allocations.
SmallString<0> versionedNameBuffer;
// Add symbols to the symbol table.
ArrayRef<Elf_Sym> syms = this->getGlobalELFSyms<ELFT>();
for (size_t i = 0; i < syms.size(); ++i) {
const Elf_Sym &sym = syms[i];
// The ELF spec requires that all local symbols precede weak or global
// symbols in each symbol table, and the index of the first non-local symbol
// is stored in sh_info. If a local symbol appears after some non-local
// symbol, that's a violation of the spec.
StringRef name = CHECK(sym.getName(this->stringTable), this);
if (sym.getBinding() == STB_LOCAL) {
warn("found local symbol '" + name +
"' in global part of symbol table in file " + toString(this));
continue;
}
if (sym.isUndefined()) {
Symbol *s = symtab->addSymbol(
Undefined{this, name, sym.getBinding(), sym.st_other, sym.getType()});
s->exportDynamic = true;
continue;
}
// The MIPS BFD linker puts a _gp_disp symbol into DSO files and incorrectly
// assigns VER_NDX_LOCAL to this global symbol. Here is a
// workaround for that bug.
uint32_t idx = versyms[i] & ~VERSYM_HIDDEN;
if (config->emachine == EM_MIPS && idx == VER_NDX_LOCAL &&
name == "_gp_disp")
continue;
uint32_t alignment = getAlignment<ELFT>(sections, sym);
if (!(versyms[i] & VERSYM_HIDDEN)) {
symtab->addSymbol(SharedSymbol{*this, name, sym.getBinding(),
sym.st_other, sym.getType(), sym.st_value,
sym.st_size, alignment, idx});
}
// Also add the symbol with the versioned name to handle undefined symbols
// with explicit versions.
if (idx == VER_NDX_GLOBAL)
continue;
if (idx >= verdefs.size() || idx == VER_NDX_LOCAL) {
error("corrupt input file: version definition index " + Twine(idx) +
" for symbol " + name + " is out of bounds\n>>> defined in " +
toString(this));
continue;
}
StringRef verName =
this->stringTable.data() +
reinterpret_cast<const Elf_Verdef *>(verdefs[idx])->getAux()->vda_name;
versionedNameBuffer.clear();
name = (name + "@" + verName).toStringRef(versionedNameBuffer);
symtab->addSymbol(SharedSymbol{*this, saver.save(name), sym.getBinding(),
sym.st_other, sym.getType(), sym.st_value,
sym.st_size, alignment, idx});
}
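// For example (hypothetical names), if the DSO exports "foo" with a verdef
// pointing at "LIBFOO_1.0", the loop above also registers "foo@LIBFOO_1.0",
// so an undefined reference written with an explicit version (e.g. via
// .symver) can still bind to this definition.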
}
static ELFKind getBitcodeELFKind(const Triple &t) {
if (t.isLittleEndian())
return t.isArch64Bit() ? ELF64LEKind : ELF32LEKind;
return t.isArch64Bit() ? ELF64BEKind : ELF32BEKind;
}
static uint8_t getBitcodeMachineKind(StringRef path, const Triple &t) {
switch (t.getArch()) {
case Triple::aarch64:
return EM_AARCH64;
case Triple::amdgcn:
case Triple::r600:
return EM_AMDGPU;
case Triple::arm:
case Triple::thumb:
return EM_ARM;
case Triple::avr:
return EM_AVR;
case Triple::mips:
case Triple::mipsel:
case Triple::mips64:
case Triple::mips64el:
return EM_MIPS;
case Triple::msp430:
return EM_MSP430;
case Triple::ppc:
return EM_PPC;
case Triple::ppc64:
case Triple::ppc64le:
return EM_PPC64;
case Triple::riscv32:
case Triple::riscv64:
return EM_RISCV;
case Triple::x86:
return t.isOSIAMCU() ? EM_IAMCU : EM_386;
case Triple::x86_64:
return EM_X86_64;
default:
error(path + ": could not infer e_machine from bitcode target triple " +
t.str());
return EM_NONE;
}
}
BitcodeFile::BitcodeFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive)
: InputFile(BitcodeKind, mb) {
this->archiveName = archiveName;
std::string path = mb.getBufferIdentifier().str();
if (config->thinLTOIndexOnly)
path = replaceThinLTOSuffix(mb.getBufferIdentifier());
// ThinLTO assumes that all MemoryBufferRefs given to it have a unique
// name. If two archives define two members with the same name, this
// causes a collision which results in only one of the objects being taken
// into consideration at LTO time (which very likely causes undefined
// symbols later in the link stage). So we append the file offset to make
// the filename unique.
StringRef name = archiveName.empty()
? saver.save(path)
: saver.save(archiveName + "(" + path + " at " +
utostr(offsetInArchive) + ")");
MemoryBufferRef mbref(mb.getBuffer(), name);
obj = CHECK(lto::InputFile::create(mbref), this);
Triple t(obj->getTargetTriple());
ekind = getBitcodeELFKind(t);
emachine = getBitcodeMachineKind(mb.getBufferIdentifier(), t);
}
static uint8_t mapVisibility(GlobalValue::VisibilityTypes gvVisibility) {
switch (gvVisibility) {
case GlobalValue::DefaultVisibility:
return STV_DEFAULT;
case GlobalValue::HiddenVisibility:
return STV_HIDDEN;
case GlobalValue::ProtectedVisibility:
return STV_PROTECTED;
}
llvm_unreachable("unknown visibility");
}
template <class ELFT>
static Symbol *createBitcodeSymbol(const std::vector<bool> &keptComdats,
const lto::InputFile::Symbol &objSym,
BitcodeFile &f) {
StringRef name = saver.save(objSym.getName());
uint8_t binding = objSym.isWeak() ? STB_WEAK : STB_GLOBAL;
uint8_t type = objSym.isTLS() ? STT_TLS : STT_NOTYPE;
uint8_t visibility = mapVisibility(objSym.getVisibility());
bool canOmitFromDynSym = objSym.canBeOmittedFromSymbolTable();
int c = objSym.getComdatIndex();
if (objSym.isUndefined() || (c != -1 && !keptComdats[c])) {
Undefined New(&f, name, binding, visibility, type);
if (canOmitFromDynSym)
New.exportDynamic = false;
return symtab->addSymbol(New);
}
if (objSym.isCommon())
return symtab->addSymbol(
CommonSymbol{&f, name, binding, visibility, STT_OBJECT,
objSym.getCommonAlignment(), objSym.getCommonSize()});
Defined New(&f, name, binding, visibility, type, 0, 0, nullptr);
if (canOmitFromDynSym)
New.exportDynamic = false;
return symtab->addSymbol(New);
}
template <class ELFT> void BitcodeFile::parse() {
std::vector<bool> keptComdats;
for (StringRef s : obj->getComdatTable())
keptComdats.push_back(
symtab->comdatGroups.try_emplace(CachedHashStringRef(s), this).second);
for (const lto::InputFile::Symbol &objSym : obj->symbols())
symbols.push_back(createBitcodeSymbol<ELFT>(keptComdats, objSym, *this));
for (auto l : obj->getDependentLibraries())
addDependentLibrary(l, this);
}
void BinaryFile::parse() {
ArrayRef<uint8_t> data = arrayRefFromStringRef(mb.getBuffer());
auto *section = make<InputSection>(this, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS,
8, data, ".data");
sections.push_back(section);
// For each input file foo that is embedded into the result as a binary
// blob, we define _binary_foo_{start,end,size} symbols, so that
// user programs can access the blob by name. Non-alphanumeric
// characters in the filename are replaced with underscores.
std::string s = "_binary_" + mb.getBufferIdentifier().str();
for (size_t i = 0; i < s.size(); ++i)
if (!isAlnum(s[i]))
s[i] = '_';
symtab->addSymbol(Defined{nullptr, saver.save(s + "_start"), STB_GLOBAL,
STV_DEFAULT, STT_OBJECT, 0, 0, section});
symtab->addSymbol(Defined{nullptr, saver.save(s + "_end"), STB_GLOBAL,
STV_DEFAULT, STT_OBJECT, data.size(), 0, section});
symtab->addSymbol(Defined{nullptr, saver.save(s + "_size"), STB_GLOBAL,
STV_DEFAULT, STT_OBJECT, data.size(), 0, nullptr});
}
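// As a sketch (hypothetical input): embedding "dir/foo.bin" as a binary blob
// (e.g. via --format=binary) defines _binary_dir_foo_bin_start (offset 0 in
// the synthetic .data section), _binary_dir_foo_bin_end (offset = blob size),
// and _binary_dir_foo_bin_size (an absolute symbol whose value is the blob
// size).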
InputFile *elf::createObjectFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive) {
if (isBitcode(mb))
return make<BitcodeFile>(mb, archiveName, offsetInArchive);
switch (getELFKind(mb, archiveName)) {
case ELF32LEKind:
return make<ObjFile<ELF32LE>>(mb, archiveName);
case ELF32BEKind:
return make<ObjFile<ELF32BE>>(mb, archiveName);
case ELF64LEKind:
return make<ObjFile<ELF64LE>>(mb, archiveName);
case ELF64BEKind:
return make<ObjFile<ELF64BE>>(mb, archiveName);
default:
llvm_unreachable("getELFKind");
}
}
void LazyObjFile::fetch() {
if (mb.getBuffer().empty())
return;
InputFile *file = createObjectFile(mb, archiveName, offsetInArchive);
file->groupId = groupId;
mb = {};
// Copy symbol vector so that the new InputFile doesn't have to
// insert the same defined symbols to the symbol table again.
file->symbols = std::move(symbols);
parseFile(file);
}
template <class ELFT> void LazyObjFile::parse() {
using Elf_Sym = typename ELFT::Sym;
// A lazy object file wraps either a bitcode file or an ELF file.
if (isBitcode(this->mb)) {
std::unique_ptr<lto::InputFile> obj =
CHECK(lto::InputFile::create(this->mb), this);
for (const lto::InputFile::Symbol &sym : obj->symbols()) {
if (sym.isUndefined())
continue;
symtab->addSymbol(LazyObject{*this, saver.save(sym.getName())});
}
return;
}
if (getELFKind(this->mb, archiveName) != config->ekind) {
error("incompatible file: " + this->mb.getBufferIdentifier());
return;
}
// Find a symbol table.
ELFFile<ELFT> obj = check(ELFFile<ELFT>::create(mb.getBuffer()));
ArrayRef<typename ELFT::Shdr> sections = CHECK(obj.sections(), this);
for (const typename ELFT::Shdr &sec : sections) {
if (sec.sh_type != SHT_SYMTAB)
continue;
// A symbol table is found.
ArrayRef<Elf_Sym> eSyms = CHECK(obj.symbols(&sec), this);
uint32_t firstGlobal = sec.sh_info;
StringRef strtab = CHECK(obj.getStringTableForSymtab(sec, sections), this);
this->symbols.resize(eSyms.size());
// Get existing symbols or insert placeholder symbols.
for (size_t i = firstGlobal, end = eSyms.size(); i != end; ++i)
if (eSyms[i].st_shndx != SHN_UNDEF)
this->symbols[i] = symtab->insert(CHECK(eSyms[i].getName(strtab), this));
// Replace existing symbols with LazyObject symbols.
//
// resolve() may trigger this->fetch() if an existing symbol is an
// undefined symbol. If that happens, this LazyObjFile has served
// its purpose, and we can exit from the loop early.
for (Symbol *sym : this->symbols) {
if (!sym)
continue;
sym->resolve(LazyObject{*this, sym->getName()});
// MemoryBuffer is emptied if this file is instantiated as ObjFile.
if (mb.getBuffer().empty())
return;
}
return;
}
}
std::string elf::replaceThinLTOSuffix(StringRef path) {
StringRef suffix = config->thinLTOObjectSuffixReplace.first;
StringRef repl = config->thinLTOObjectSuffixReplace.second;
if (path.consume_back(suffix))
return (path + repl).str();
return path;
}
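// Sketch (assuming a suffix pair of ".thinlto.bc" and ".o", e.g. as set by
// --thinlto-object-suffix-replace): "foo.thinlto.bc" becomes "foo.o", and a
// path that doesn't end with the old suffix is returned unchanged.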
template void BitcodeFile::parse<ELF32LE>();
template void BitcodeFile::parse<ELF32BE>();
template void BitcodeFile::parse<ELF64LE>();
template void BitcodeFile::parse<ELF64BE>();
template void LazyObjFile::parse<ELF32LE>();
template void LazyObjFile::parse<ELF32BE>();
template void LazyObjFile::parse<ELF64LE>();
template void LazyObjFile::parse<ELF64BE>();
template class elf::ObjFile<ELF32LE>;
template class elf::ObjFile<ELF32BE>;
template class elf::ObjFile<ELF64LE>;
template class elf::ObjFile<ELF64BE>;
template void SharedFile::parse<ELF32LE>();
template void SharedFile::parse<ELF32BE>();
template void SharedFile::parse<ELF64LE>();
template void SharedFile::parse<ELF64BE>();
Index: projects/clang900-import/contrib/llvm/tools/lld/ELF/Symbols.cpp
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/ELF/Symbols.cpp (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/ELF/Symbols.cpp (revision 351722)
@@ -1,656 +1,663 @@
//===- Symbols.cpp --------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Symbols.h"
#include "InputFiles.h"
#include "InputSection.h"
#include "OutputSections.h"
#include "SyntheticSections.h"
#include "Target.h"
#include "Writer.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Strings.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Path.h"
#include <cstring>
using namespace llvm;
using namespace llvm::object;
using namespace llvm::ELF;
using namespace lld;
using namespace lld::elf;
Defined *ElfSym::bss;
Defined *ElfSym::etext1;
Defined *ElfSym::etext2;
Defined *ElfSym::edata1;
Defined *ElfSym::edata2;
Defined *ElfSym::end1;
Defined *ElfSym::end2;
Defined *ElfSym::globalOffsetTable;
Defined *ElfSym::mipsGp;
Defined *ElfSym::mipsGpDisp;
Defined *ElfSym::mipsLocalGp;
Defined *ElfSym::relaIpltStart;
Defined *ElfSym::relaIpltEnd;
Defined *ElfSym::riscvGlobalPointer;
Defined *ElfSym::tlsModuleBase;
+// Returns a symbol for an error message.
+static std::string demangle(StringRef symName) {
+ if (config->demangle)
+ if (Optional<std::string> s = demangleItanium(symName))
+ return *s;
+ return symName;
+}
+namespace lld {
+std::string toString(const Symbol &b) { return demangle(b.getName()); }
+std::string toELFString(const Archive::Symbol &b) {
+ return demangle(b.getName());
+}
+} // namespace lld
+
static uint64_t getSymVA(const Symbol &sym, int64_t &addend) {
switch (sym.kind()) {
case Symbol::DefinedKind: {
auto &d = cast<Defined>(sym);
SectionBase *isec = d.section;
// This is an absolute symbol.
if (!isec)
return d.value;
assert(isec != &InputSection::discarded);
isec = isec->repl;
uint64_t offset = d.value;
// An object in an SHF_MERGE section might be referenced via a
// section symbol (as a hack for reducing the number of local
// symbols).
// Depending on the addend, the reference via a section symbol
// refers to a different object in the merge section.
// Since the objects in the merge section are not necessarily
// contiguous in the output, the addend can thus affect the final
// VA in a non-linear way.
// To make this work, we incorporate the addend into the section
// offset (and zero out the addend for later processing) so that
// we find the right object in the section.
if (d.isSection()) {
offset += addend;
addend = 0;
}
// In the typical case, this is actually very simple and boils
// down to adding together 3 numbers:
// 1. The address of the output section.
// 2. The offset of the input section within the output section.
// 3. The offset within the input section (this addition happens
// inside InputSection::getOffset).
//
// If you understand the data structures involved with this next
// line (and how they get built), then you have a pretty good
// understanding of the linker.
uint64_t va = isec->getVA(offset);
// MIPS relocatable files can mix regular and microMIPS code.
// The linker needs to distinguish such code. To do so, microMIPS
// symbols have the `STO_MIPS_MICROMIPS` flag in the `st_other`
// field. Unfortunately, the `MIPS::relocateOne()` method gets only
// a symbol value. To pass the type of the symbol (regular/microMIPS)
// to that routine, as well as to other places where we write
// a symbol value as-is (the .dynamic section, the `Elf_Ehdr::e_entry`
// field, etc.), we use the same trick the compiler uses to mark
// microMIPS code for the CPU: we set the least-significant bit.
if (config->emachine == EM_MIPS && isMicroMips() &&
((sym.stOther & STO_MIPS_MICROMIPS) || sym.needsPltAddr))
va |= 1;
if (d.isTls() && !config->relocatable) {
// Use the address of the TLS segment's first section rather than the
// segment's address, because segment addresses aren't initialized until
// after sections are finalized. (e.g. Measuring the size of .rela.dyn
// for Android relocation packing requires knowing TLS symbol addresses
// during section finalization.)
if (!Out::tlsPhdr || !Out::tlsPhdr->firstSec)
fatal(toString(d.file) +
" has an STT_TLS symbol but doesn't have an SHF_TLS section");
return va - Out::tlsPhdr->firstSec->addr;
}
return va;
}
case Symbol::SharedKind:
case Symbol::UndefinedKind:
return 0;
case Symbol::LazyArchiveKind:
case Symbol::LazyObjectKind:
assert(sym.isUsedInRegularObj && "lazy symbol reached writer");
return 0;
case Symbol::CommonKind:
llvm_unreachable("common symbol reached writer");
case Symbol::PlaceholderKind:
llvm_unreachable("placeholder symbol reached writer");
}
llvm_unreachable("invalid symbol kind");
}
uint64_t Symbol::getVA(int64_t addend) const {
uint64_t outVA = getSymVA(*this, addend);
return outVA + addend;
}
uint64_t Symbol::getGotVA() const {
if (gotInIgot)
return in.igotPlt->getVA() + getGotPltOffset();
return in.got->getVA() + getGotOffset();
}
uint64_t Symbol::getGotOffset() const { return gotIndex * config->wordsize; }
uint64_t Symbol::getGotPltVA() const {
if (isInIplt)
return in.igotPlt->getVA() + getGotPltOffset();
return in.gotPlt->getVA() + getGotPltOffset();
}
uint64_t Symbol::getGotPltOffset() const {
if (isInIplt)
return pltIndex * config->wordsize;
return (pltIndex + target->gotPltHeaderEntriesNum) * config->wordsize;
}
uint64_t Symbol::getPPC64LongBranchOffset() const {
assert(ppc64BranchltIndex != 0xffff);
return ppc64BranchltIndex * config->wordsize;
}
uint64_t Symbol::getPltVA() const {
PltSection *plt = isInIplt ? in.iplt : in.plt;
uint64_t outVA =
plt->getVA() + plt->headerSize + pltIndex * target->pltEntrySize;
// When linking microMIPS code, PLT entries are always microMIPS
// code. Set the least-significant bit to track that fact.
// See the detailed comment in the `getSymVA` function.
if (config->emachine == EM_MIPS && isMicroMips())
outVA |= 1;
return outVA;
}
uint64_t Symbol::getPPC64LongBranchTableVA() const {
assert(ppc64BranchltIndex != 0xffff);
return in.ppc64LongBranchTarget->getVA() +
ppc64BranchltIndex * config->wordsize;
}
uint64_t Symbol::getSize() const {
if (const auto *dr = dyn_cast<Defined>(this))
return dr->size;
return cast<SharedSymbol>(this)->size;
}
OutputSection *Symbol::getOutputSection() const {
if (auto *s = dyn_cast<Defined>(this)) {
if (auto *sec = s->section)
return sec->repl->getOutputSection();
return nullptr;
}
return nullptr;
}
// If a symbol name contains '@', the characters after it are
// a symbol version name. This function parses that.
void Symbol::parseSymbolVersion() {
StringRef s = getName();
size_t pos = s.find('@');
if (pos == 0 || pos == StringRef::npos)
return;
StringRef verstr = s.substr(pos + 1);
if (verstr.empty())
return;
// Truncate the symbol name so that it doesn't include the version string.
nameSize = pos;
// If this is not in this DSO, it is not a definition.
if (!isDefined())
return;
// '@@' in a symbol name means the default version.
// It is usually the most recent one.
bool isDefault = (verstr[0] == '@');
if (isDefault)
verstr = verstr.substr(1);
for (VersionDefinition &ver : config->versionDefinitions) {
if (ver.name != verstr)
continue;
if (isDefault)
versionId = ver.id;
else
versionId = ver.id | VERSYM_HIDDEN;
return;
}
// It is an error if the specified version is not defined.
// Usually a version script is not provided when linking an executable,
// but we may still want to override a versioned symbol from a DSO,
// so we do not report an error in this case. We also do not error
// if the symbol has a local version, as it won't be in the dynamic
// symbol table.
if (config->shared && versionId != VER_NDX_LOCAL)
error(toString(file) + ": symbol " + s + " has undefined version " +
verstr);
}
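// Sketch (hypothetical names): a defined symbol "foo@@VER_2" is truncated
// to "foo" with versionId set to VER_2's id (the default version), while
// "foo@VER_1" gets VER_1's id with VERSYM_HIDDEN set, so it is exported
// only under that explicit version.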
void Symbol::fetch() const {
if (auto *sym = dyn_cast<LazyArchive>(this)) {
cast<ArchiveFile>(sym->file)->fetch(sym->sym);
return;
}
if (auto *sym = dyn_cast<LazyObject>(this)) {
dyn_cast<LazyObjFile>(sym->file)->fetch();
return;
}
llvm_unreachable("Symbol::fetch() is called on a non-lazy symbol");
}
MemoryBufferRef LazyArchive::getMemberBuffer() {
- Archive::Child c = CHECK(
- sym.getMember(), "could not get the member for symbol " + sym.getName());
+ Archive::Child c =
+ CHECK(sym.getMember(),
+ "could not get the member for symbol " + toELFString(sym));
return CHECK(c.getMemoryBufferRef(),
"could not get the buffer for the member defining symbol " +
- sym.getName());
+ toELFString(sym));
}
uint8_t Symbol::computeBinding() const {
if (config->relocatable)
return binding;
if (visibility != STV_DEFAULT && visibility != STV_PROTECTED)
return STB_LOCAL;
if (versionId == VER_NDX_LOCAL && isDefined() && !isPreemptible)
return STB_LOCAL;
if (!config->gnuUnique && binding == STB_GNU_UNIQUE)
return STB_GLOBAL;
return binding;
}
bool Symbol::includeInDynsym() const {
if (!config->hasDynSymTab)
return false;
if (computeBinding() == STB_LOCAL)
return false;
// If a PIE binary was not linked against any shared libraries, then we can
// safely drop weak undef symbols from .dynsym.
if (isUndefWeak() && config->pie && sharedFiles.empty())
return false;
return isUndefined() || isShared() || exportDynamic;
}
// Print out a log message for --trace-symbol.
void elf::printTraceSymbol(const Symbol *sym) {
std::string s;
if (sym->isUndefined())
s = ": reference to ";
else if (sym->isLazy())
s = ": lazy definition of ";
else if (sym->isShared())
s = ": shared definition of ";
else if (sym->isCommon())
s = ": common definition of ";
else
s = ": definition of ";
message(toString(sym->file) + s + sym->getName());
}
void elf::maybeWarnUnorderableSymbol(const Symbol *sym) {
if (!config->warnSymbolOrdering)
return;
// If UnresolvedPolicy::Ignore is used, no "undefined symbol" error/warning
// is emitted. It makes sense to not warn on undefined symbols.
//
// Note, ld.bfd --symbol-ordering-file= does not warn on undefined symbols,
// but we don't have to be compatible here.
if (sym->isUndefined() &&
config->unresolvedSymbols == UnresolvedPolicy::Ignore)
return;
const InputFile *file = sym->file;
auto *d = dyn_cast<Defined>(sym);
auto report = [&](StringRef s) { warn(toString(file) + s + sym->getName()); };
if (sym->isUndefined())
report(": unable to order undefined symbol: ");
else if (sym->isShared())
report(": unable to order shared symbol: ");
else if (d && !d->section)
report(": unable to order absolute symbol: ");
else if (d && isa<OutputSection>(d->section))
report(": unable to order synthetic symbol: ");
else if (d && !d->section->repl->isLive())
report(": unable to order discarded symbol: ");
-}
-
-// Returns a symbol for an error message.
-std::string lld::toString(const Symbol &b) {
- if (config->demangle)
- if (Optional<std::string> s = demangleItanium(b.getName()))
- return *s;
- return b.getName();
}
static uint8_t getMinVisibility(uint8_t va, uint8_t vb) {
if (va == STV_DEFAULT)
return vb;
if (vb == STV_DEFAULT)
return va;
return std::min(va, vb);
}
// Merge symbol properties.
//
// When we have many symbols of the same name, we choose one of them,
// and that's the result of symbol resolution. However, symbols that
// were not chosen still affect some symbol properties.
void Symbol::mergeProperties(const Symbol &other) {
if (other.exportDynamic)
exportDynamic = true;
if (other.isUsedInRegularObj)
isUsedInRegularObj = true;
// DSO symbols do not affect visibility in the output.
if (!other.isShared())
visibility = getMinVisibility(visibility, other.visibility);
}
void Symbol::resolve(const Symbol &other) {
mergeProperties(other);
if (isPlaceholder()) {
replace(other);
return;
}
switch (other.kind()) {
case Symbol::UndefinedKind:
resolveUndefined(cast<Undefined>(other));
break;
case Symbol::CommonKind:
resolveCommon(cast<CommonSymbol>(other));
break;
case Symbol::DefinedKind:
resolveDefined(cast<Defined>(other));
break;
case Symbol::LazyArchiveKind:
resolveLazy(cast<LazyArchive>(other));
break;
case Symbol::LazyObjectKind:
resolveLazy(cast<LazyObject>(other));
break;
case Symbol::SharedKind:
resolveShared(cast<SharedSymbol>(other));
break;
case Symbol::PlaceholderKind:
llvm_unreachable("bad symbol kind");
}
}
void Symbol::resolveUndefined(const Undefined &other) {
// An undefined symbol with non-default visibility must be satisfied
// in the same DSO.
//
// If this is a non-weak defined symbol in a discarded section, override the
// existing undefined symbol for a better error message later.
if ((isShared() && other.visibility != STV_DEFAULT) ||
(isUndefined() && other.binding != STB_WEAK && other.discardedSecIdx)) {
replace(other);
return;
}
if (traced)
printTraceSymbol(&other);
if (isLazy()) {
// An undefined weak will not fetch archive members. See comment on Lazy in
// Symbols.h for the details.
if (other.binding == STB_WEAK) {
binding = STB_WEAK;
type = other.type;
return;
}
// Do extra check for --warn-backrefs.
//
// --warn-backrefs is an option to prevent an undefined reference from
// fetching an archive member written earlier in the command line. It can be
// used to keep compatibility with GNU linkers to some degree.
// I'll explain the feature and why you may find it useful in this comment.
//
// lld's symbol resolution semantics are more relaxed than those of
// traditional Unix linkers. For example,
//
// ld.lld foo.a bar.o
//
// succeeds even if bar.o contains an undefined symbol that has to be
// resolved by some object file in foo.a. Traditional Unix linkers don't
// allow this kind of backward reference, as they visit each file only once
// from left to right in the command line while resolving all undefined
// symbols at the moment of visiting.
//
// In the above case, since there's no undefined symbol when a linker visits
// foo.a, no files are pulled out from foo.a, and because the linker forgets
// about foo.a after visiting, it can't resolve undefined symbols in bar.o
// that could have been resolved otherwise.
//
// The fact that lld accepts this more relaxed form means that (besides it
// arguably making more sense) you can accidentally write a command line or a
// build file that works only with lld, even if you plan to distribute it to
// a wider audience who may be using GNU linkers. With --warn-backrefs, you
// can detect a library order that doesn't work with other Unix linkers.
//
// The option is also useful to detect cyclic dependencies between static
// archives. Again, lld accepts
//
// ld.lld foo.a bar.a
//
// even if foo.a and bar.a depend on each other. With --warn-backrefs, it is
// handled as an error.
//
// Here is how the option works. We assign a group ID to each file. A file
// with a smaller group ID can pull out object files from an archive file
// with an equal or greater group ID. Otherwise, it is a reverse dependency
// and an error.
//
// A file outside --{start,end}-group gets a fresh ID when instantiated. All
// files within the same --{start,end}-group get the same group ID. E.g.
//
// ld.lld A B --start-group C D --end-group E
//
// A forms group 0. B forms group 1. C and D (including their member object
// files) form group 2. E forms group 3. I think that you can see how this
// group assignment rule simulates the traditional linker's semantics.
bool backref = config->warnBackrefs && other.file &&
file->groupId < other.file->groupId;
fetch();
// We don't report backward references to weak symbols as they can be
// overridden later.
if (backref && !isWeak())
warn("backward reference detected: " + other.getName() + " in " +
toString(other.file) + " refers to " + toString(file));
return;
}
// Undefined symbols in a SharedFile do not change the binding.
if (dyn_cast_or_null<SharedFile>(other.file))
return;
if (isUndefined()) {
// The binding may "upgrade" from weak to non-weak.
if (other.binding != STB_WEAK)
binding = other.binding;
} else if (auto *s = dyn_cast<SharedSymbol>(this)) {
// The binding of a SharedSymbol will be weak if there is at least one
// reference and all are weak. The binding has one opportunity to change to
// weak: if the first reference is weak.
if (other.binding != STB_WEAK || !s->referenced)
binding = other.binding;
s->referenced = true;
}
}
// Using .symver foo,foo@@VER unfortunately creates two symbols: foo and
// foo@@VER. We want to effectively ignore foo, so give precedence to
// foo@@VER.
// FIXME: If users can transition to using
// .symver foo,foo@@@VER
// we can delete this hack.
static int compareVersion(StringRef a, StringRef b) {
bool x = a.contains("@@");
bool y = b.contains("@@");
if (!x && y)
return 1;
if (x && !y)
return -1;
return 0;
}
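// Sketch: with an existing plain "foo" and an incoming "foo@@VER_1" (both
// created by ".symver foo,foo@@VER_1"), compareVersion returns 1 and the
// "@@" symbol wins; in the opposite order it returns -1, so the plain
// symbol loses rather than being reported as a duplicate.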
// Compare two symbols. Return 1 if the new symbol should win, -1 if
// the new symbol should lose, or 0 if there is a conflict.
int Symbol::compare(const Symbol *other) const {
assert(other->isDefined() || other->isCommon());
if (!isDefined() && !isCommon())
return 1;
if (int cmp = compareVersion(getName(), other->getName()))
return cmp;
if (other->isWeak())
return -1;
if (isWeak())
return 1;
if (isCommon() && other->isCommon()) {
if (config->warnCommon)
warn("multiple common of " + getName());
return 0;
}
if (isCommon()) {
if (config->warnCommon)
warn("common " + getName() + " is overridden");
return 1;
}
if (other->isCommon()) {
if (config->warnCommon)
warn("common " + getName() + " is overridden");
return -1;
}
auto *oldSym = cast<Defined>(this);
auto *newSym = cast<Defined>(other);
if (other->file && isa<BitcodeFile>(other->file))
return 0;
if (!oldSym->section && !newSym->section && oldSym->value == newSym->value &&
newSym->binding == STB_GLOBAL)
return -1;
return 0;
}
static void reportDuplicate(Symbol *sym, InputFile *newFile,
InputSectionBase *errSec, uint64_t errOffset) {
if (config->allowMultipleDefinition)
return;
Defined *d = cast<Defined>(sym);
if (!d->section || !errSec) {
error("duplicate symbol: " + toString(*sym) + "\n>>> defined in " +
toString(sym->file) + "\n>>> defined in " + toString(newFile));
return;
}
// Construct and print an error message in the form of:
//
// ld.lld: error: duplicate symbol: foo
// >>> defined at bar.c:30
// >>> bar.o (/home/alice/src/bar.o)
// >>> defined at baz.c:563
// >>> baz.o in archive libbaz.a
auto *sec1 = cast<InputSectionBase>(d->section);
std::string src1 = sec1->getSrcMsg(*sym, d->value);
std::string obj1 = sec1->getObjMsg(d->value);
std::string src2 = errSec->getSrcMsg(*sym, errOffset);
std::string obj2 = errSec->getObjMsg(errOffset);
std::string msg = "duplicate symbol: " + toString(*sym) + "\n>>> defined at ";
if (!src1.empty())
msg += src1 + "\n>>> ";
msg += obj1 + "\n>>> defined at ";
if (!src2.empty())
msg += src2 + "\n>>> ";
msg += obj2;
error(msg);
}
void Symbol::resolveCommon(const CommonSymbol &other) {
int cmp = compare(&other);
if (cmp < 0)
return;
if (cmp > 0) {
replace(other);
return;
}
CommonSymbol *oldSym = cast<CommonSymbol>(this);
oldSym->alignment = std::max(oldSym->alignment, other.alignment);
if (oldSym->size < other.size) {
oldSym->file = other.file;
oldSym->size = other.size;
}
}
void Symbol::resolveDefined(const Defined &other) {
int cmp = compare(&other);
if (cmp > 0)
replace(other);
else if (cmp == 0)
reportDuplicate(this, other.file,
dyn_cast_or_null<InputSectionBase>(other.section),
other.value);
}
template <class LazyT> void Symbol::resolveLazy(const LazyT &other) {
if (!isUndefined())
return;
// An undefined weak will not fetch archive members. See comment on Lazy in
// Symbols.h for the details.
if (isWeak()) {
uint8_t ty = type;
replace(other);
type = ty;
binding = STB_WEAK;
return;
}
other.fetch();
}
void Symbol::resolveShared(const SharedSymbol &other) {
if (visibility == STV_DEFAULT && (isUndefined() || isLazy())) {
// An undefined symbol with non-default visibility must be satisfied
// in the same DSO.
uint8_t bind = binding;
replace(other);
binding = bind;
cast<SharedSymbol>(this)->referenced = true;
}
}
Index: projects/clang900-import/contrib/llvm/tools/lld/ELF/Symbols.h
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/ELF/Symbols.h (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/ELF/Symbols.h (revision 351722)
@@ -1,554 +1,558 @@
//===- Symbols.h ------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines various types of Symbols.
//
//===----------------------------------------------------------------------===//
#ifndef LLD_ELF_SYMBOLS_H
#define LLD_ELF_SYMBOLS_H
#include "InputFiles.h"
#include "InputSection.h"
#include "lld/Common/LLVM.h"
#include "lld/Common/Strings.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ELF.h"
namespace lld {
namespace elf {
class CommonSymbol;
class Defined;
class InputFile;
class LazyArchive;
class LazyObject;
class SharedSymbol;
class Symbol;
class Undefined;
} // namespace elf
std::string toString(const elf::Symbol &);
-std::string toString(const elf::InputFile *);
+
+// There are two different ways to convert an Archive::Symbol to a string:
+// One for Microsoft name mangling and one for Itanium name mangling.
+// Call the functions toCOFFString and toELFString, not just toString.
+std::string toELFString(const elf::Archive::Symbol &);
namespace elf {
// This is a StringRef-like container that doesn't run strlen().
//
// ELF string tables contain a lot of null-terminated strings. Most of them
// are not necessary for the linker because they are names of local symbols,
// and the linker doesn't use local symbol names for name resolution. So, we
// use this class to represent strings read from string tables.
struct StringRefZ {
StringRefZ(const char *s) : data(s), size(-1) {}
StringRefZ(StringRef s) : data(s.data()), size(s.size()) {}
const char *data;
const uint32_t size;
};
// The base class for real symbol classes.
class Symbol {
public:
enum Kind {
PlaceholderKind,
DefinedKind,
CommonKind,
SharedKind,
UndefinedKind,
LazyArchiveKind,
LazyObjectKind,
};
Kind kind() const { return static_cast<Kind>(symbolKind); }
// The file from which this symbol was created.
InputFile *file;
protected:
const char *nameData;
mutable uint32_t nameSize;
public:
uint32_t dynsymIndex = 0;
uint32_t gotIndex = -1;
uint32_t pltIndex = -1;
uint32_t globalDynIndex = -1;
// This field is an index into the symbol's version definition.
uint32_t verdefIndex = -1;
// Version definition index.
uint16_t versionId;
// An index into the .branch_lt section on PPC64.
uint16_t ppc64BranchltIndex = -1;
// Symbol binding. This is not overwritten by replace() to track
// changes during resolution. In particular:
// - An undefined weak is still weak when it resolves to a shared library.
// - An undefined weak will not fetch archive members, but we have to
// remember it is weak.
uint8_t binding;
// The following fields have the same meaning as the ELF symbol attributes.
uint8_t type; // symbol type
uint8_t stOther; // st_other field value
uint8_t symbolKind;
// Symbol visibility. This is the computed minimum visibility of all
// observed non-DSO symbols.
unsigned visibility : 2;
// True if the symbol was used for linking and thus needs to be added to the
// output file's symbol table. This is true for all symbols except for
// unreferenced DSO symbols, lazy (archive) symbols, and bitcode symbols that
// are unreferenced except by other bitcode objects.
unsigned isUsedInRegularObj : 1;
// If this flag is true and the symbol has protected or default visibility, it
// will appear in .dynsym. This flag is set by interposable DSO symbols in
// executables, by most symbols in DSOs and executables built with
// --export-dynamic, and by dynamic lists.
unsigned exportDynamic : 1;
// False if LTO shouldn't inline whatever this symbol points to. If a symbol
// is overwritten after LTO, LTO shouldn't inline the symbol because it
// doesn't know the final contents of the symbol.
unsigned canInline : 1;
// True if this symbol is specified by --trace-symbol option.
unsigned traced : 1;
inline void replace(const Symbol &New);
bool includeInDynsym() const;
uint8_t computeBinding() const;
bool isWeak() const { return binding == llvm::ELF::STB_WEAK; }
bool isUndefined() const { return symbolKind == UndefinedKind; }
bool isCommon() const { return symbolKind == CommonKind; }
bool isDefined() const { return symbolKind == DefinedKind; }
bool isShared() const { return symbolKind == SharedKind; }
bool isPlaceholder() const { return symbolKind == PlaceholderKind; }
bool isLocal() const { return binding == llvm::ELF::STB_LOCAL; }
bool isLazy() const {
return symbolKind == LazyArchiveKind || symbolKind == LazyObjectKind;
}
// True if this is an undefined weak symbol. This only works once
// all input files have been added.
bool isUndefWeak() const {
// See comment on lazy symbols for details.
return isWeak() && (isUndefined() || isLazy());
}
StringRef getName() const {
if (nameSize == (uint32_t)-1)
nameSize = strlen(nameData);
return {nameData, nameSize};
}
void setName(StringRef s) {
nameData = s.data();
nameSize = s.size();
}
void parseSymbolVersion();
bool isInGot() const { return gotIndex != -1U; }
bool isInPlt() const { return pltIndex != -1U; }
bool isInPPC64Branchlt() const { return ppc64BranchltIndex != 0xffff; }
uint64_t getVA(int64_t addend = 0) const;
uint64_t getGotOffset() const;
uint64_t getGotVA() const;
uint64_t getGotPltOffset() const;
uint64_t getGotPltVA() const;
uint64_t getPltVA() const;
uint64_t getPPC64LongBranchTableVA() const;
uint64_t getPPC64LongBranchOffset() const;
uint64_t getSize() const;
OutputSection *getOutputSection() const;
// The following two functions are used for symbol resolution.
//
// You are expected to call mergeProperties for all symbols in input
// files so that attributes that are attached to names rather than
// individual symbols (such as visibility) are merged together.
//
// Every time you read a new symbol from an input, you are supposed
// to call resolve() with the new symbol. That function replaces
// "this" object as a result of name resolution if the new symbol is
// more appropriate to be included in the output.
//
// For example, if "this" is an undefined symbol and a new symbol is
// a defined symbol, "this" is replaced with the new symbol.
void mergeProperties(const Symbol &other);
void resolve(const Symbol &other);
// If this is a lazy symbol, fetch an input file and add the symbol
// in the file to the symbol table. Calling this function on
// a non-lazy symbol causes a runtime error.
void fetch() const;
private:
static bool isExportDynamic(Kind k, uint8_t visibility) {
if (k == SharedKind)
return visibility == llvm::ELF::STV_DEFAULT;
return config->shared || config->exportDynamic;
}
void resolveUndefined(const Undefined &other);
void resolveCommon(const CommonSymbol &other);
void resolveDefined(const Defined &other);
template <class LazyT> void resolveLazy(const LazyT &other);
void resolveShared(const SharedSymbol &other);
int compare(const Symbol *other) const;
inline size_t getSymbolSize() const;
protected:
Symbol(Kind k, InputFile *file, StringRefZ name, uint8_t binding,
uint8_t stOther, uint8_t type)
: file(file), nameData(name.data), nameSize(name.size), binding(binding),
type(type), stOther(stOther), symbolKind(k), visibility(stOther & 3),
isUsedInRegularObj(!file || file->kind() == InputFile::ObjKind),
exportDynamic(isExportDynamic(k, visibility)), canInline(false),
traced(false), needsPltAddr(false), isInIplt(false), gotInIgot(false),
isPreemptible(false), used(!config->gcSections), needsTocRestore(false),
scriptDefined(false) {}
public:
// True if the symbol should point to its PLT entry.
// For SharedSymbol only.
unsigned needsPltAddr : 1;
// True if this symbol is in the Iplt sub-section of the Plt and the Igot
// sub-section of the .got.plt or .got.
unsigned isInIplt : 1;
// True if this symbol needs a GOT entry and its GOT entry is actually in
// Igot. This will be true only for certain non-preemptible ifuncs.
unsigned gotInIgot : 1;
// True if this symbol is preemptible at load time.
unsigned isPreemptible : 1;
// True if an undefined or shared symbol is used from a live section.
unsigned used : 1;
// True if a call to this symbol needs to be followed by a restore of the
// PPC64 toc pointer.
unsigned needsTocRestore : 1;
// True if this symbol is defined by a linker script.
unsigned scriptDefined : 1;
// The partition whose dynamic symbol table contains this symbol's definition.
uint8_t partition = 1;
bool isSection() const { return type == llvm::ELF::STT_SECTION; }
bool isTls() const { return type == llvm::ELF::STT_TLS; }
bool isFunc() const { return type == llvm::ELF::STT_FUNC; }
bool isGnuIFunc() const { return type == llvm::ELF::STT_GNU_IFUNC; }
bool isObject() const { return type == llvm::ELF::STT_OBJECT; }
bool isFile() const { return type == llvm::ELF::STT_FILE; }
};
// Represents a symbol that is defined in the current output file.
class Defined : public Symbol {
public:
Defined(InputFile *file, StringRefZ name, uint8_t binding, uint8_t stOther,
uint8_t type, uint64_t value, uint64_t size, SectionBase *section)
: Symbol(DefinedKind, file, name, binding, stOther, type), value(value),
size(size), section(section) {}
static bool classof(const Symbol *s) { return s->isDefined(); }
uint64_t value;
uint64_t size;
SectionBase *section;
};
// Represents a common symbol.
//
// On Unix, it is traditionally allowed to write variable definitions
// without initialization expressions (such as "int foo;") in header
// files. Such a definition is called a "tentative definition".
//
// Using tentative definitions is usually considered bad practice
// because you should write only declarations (such as "extern int
// foo;") in header files. Nevertheless, the linker and the compiler
// have to do something to support bad code by allowing duplicate
// definitions for this particular case.
//
// Common symbols represent variable definitions without initializations.
// The compiler creates common symbols when it sees variable definitions
// without initialization (you can suppress this behavior and let the
// compiler create a regular defined symbol with -fno-common).
//
// The linker allows common symbols to be replaced by regular defined
// symbols. If there are remaining common symbols after name resolution is
// complete, they are converted to regular defined symbols in a .bss
// section. (Therefore, the later passes don't see any CommonSymbols.)
class CommonSymbol : public Symbol {
public:
CommonSymbol(InputFile *file, StringRefZ name, uint8_t binding,
uint8_t stOther, uint8_t type, uint64_t alignment, uint64_t size)
: Symbol(CommonKind, file, name, binding, stOther, type),
alignment(alignment), size(size) {}
static bool classof(const Symbol *s) { return s->isCommon(); }
uint32_t alignment;
uint64_t size;
};
class Undefined : public Symbol {
public:
Undefined(InputFile *file, StringRefZ name, uint8_t binding, uint8_t stOther,
uint8_t type, uint32_t discardedSecIdx = 0)
: Symbol(UndefinedKind, file, name, binding, stOther, type),
discardedSecIdx(discardedSecIdx) {}
static bool classof(const Symbol *s) { return s->kind() == UndefinedKind; }
// The section index if in a discarded section, 0 otherwise.
uint32_t discardedSecIdx;
};
class SharedSymbol : public Symbol {
public:
static bool classof(const Symbol *s) { return s->kind() == SharedKind; }
SharedSymbol(InputFile &file, StringRef name, uint8_t binding,
uint8_t stOther, uint8_t type, uint64_t value, uint64_t size,
uint32_t alignment, uint32_t verdefIndex)
: Symbol(SharedKind, &file, name, binding, stOther, type), value(value),
size(size), alignment(alignment) {
this->verdefIndex = verdefIndex;
// GNU ifunc is a mechanism that allows user-supplied functions to
// resolve PLT slot values at load time. This is contrary to the
// regular symbol resolution scheme in which symbols are resolved just
// by name. Using this hook, you can program how symbols are resolved
// for your program. For example, you can make "memcpy" resolve to an
// SSE-enabled version of memcpy only when the machine running the
// program supports the SSE instruction set.
//
// Naturally, such symbols should always be called through their PLT
// slots. What GNU ifunc symbols point to are resolver functions, and
// calling them directly doesn't make sense (unless you are writing a
// loader).
//
// For DSO symbols, we always call them through PLT slots anyway.
// So there's no difference between GNU ifunc and regular function
// symbols if they are in DSOs. So we can handle GNU_IFUNC as FUNC.
if (this->type == llvm::ELF::STT_GNU_IFUNC)
this->type = llvm::ELF::STT_FUNC;
}
SharedFile &getFile() const { return *cast<SharedFile>(file); }
uint64_t value; // st_value
uint64_t size; // st_size
uint32_t alignment;
// This is true if there has been at least one undefined reference to the
// symbol. The binding may change to STB_WEAK if the first undefined reference
// is weak.
bool referenced = false;
};
// LazyArchive and LazyObject represent symbols that are not yet in the link,
// but we know where to find them if needed. If the resolver finds both an
// Undefined and a Lazy for the same name, it will ask the Lazy to load a file.
//
// A special complication is the handling of weak undefined symbols. They should
// not load a file, but we have to remember we have seen both the weak undefined
// and the lazy. We represent that with a lazy symbol with a weak binding. This
// means that code looking for undefined symbols normally also has to take lazy
// symbols into consideration.
// This class represents a symbol defined in an archive file. It is
// created from an archive file header, and it knows how to load an
// object file from an archive to replace itself with a defined
// symbol.
class LazyArchive : public Symbol {
public:
LazyArchive(InputFile &file, const llvm::object::Archive::Symbol s)
: Symbol(LazyArchiveKind, &file, s.getName(), llvm::ELF::STB_GLOBAL,
llvm::ELF::STV_DEFAULT, llvm::ELF::STT_NOTYPE),
sym(s) {}
static bool classof(const Symbol *s) { return s->kind() == LazyArchiveKind; }
MemoryBufferRef getMemberBuffer();
const llvm::object::Archive::Symbol sym;
};
// LazyObject symbols represent symbols in object files between
// --start-lib and --end-lib options.
class LazyObject : public Symbol {
public:
LazyObject(InputFile &file, StringRef name)
: Symbol(LazyObjectKind, &file, name, llvm::ELF::STB_GLOBAL,
llvm::ELF::STV_DEFAULT, llvm::ELF::STT_NOTYPE) {}
static bool classof(const Symbol *s) { return s->kind() == LazyObjectKind; }
};
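// Illustrative example (not part of lld): both invocations below make b.o's
// symbols lazy, i.e. b.o is pulled into the link only if it resolves an
// otherwise undefined reference.
//
//   $ ld.lld main.o libb.a                     # LazyArchive symbols
//   $ ld.lld main.o --start-lib b.o --end-lib  # LazyObject symbols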
// Some linker-generated symbols need to be created as
// Defined symbols.
struct ElfSym {
// __bss_start
static Defined *bss;
// etext and _etext
static Defined *etext1;
static Defined *etext2;
// edata and _edata
static Defined *edata1;
static Defined *edata2;
// end and _end
static Defined *end1;
static Defined *end2;
// The _GLOBAL_OFFSET_TABLE_ symbol is defined by target convention to
// be at some offset from the base of the .got section, usually 0 or
// the end of the .got.
static Defined *globalOffsetTable;
// _gp, _gp_disp and __gnu_local_gp symbols. Only for MIPS.
static Defined *mipsGp;
static Defined *mipsGpDisp;
static Defined *mipsLocalGp;
// __rel{,a}_iplt_{start,end} symbols.
static Defined *relaIpltStart;
static Defined *relaIpltEnd;
// __global_pointer$ for RISC-V.
static Defined *riscvGlobalPointer;
// _TLS_MODULE_BASE_ on targets that support TLSDESC.
static Defined *tlsModuleBase;
};
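// Illustrative example (not part of lld): programs can take the addresses of
// these linker-provided symbols; by convention etext, edata and end mark the
// end of the text segment, of initialized data, and of the BSS respectively.
//
//   extern char etext[], edata[], end[];
//   printf("etext=%p edata=%p end=%p\n",
//          (void *)etext, (void *)edata, (void *)end);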
// A buffer class that is large enough to hold any Symbol-derived
// object. We allocate memory using this class and instantiate a symbol
// using placement new.
union SymbolUnion {
alignas(Defined) char a[sizeof(Defined)];
alignas(CommonSymbol) char b[sizeof(CommonSymbol)];
alignas(Undefined) char c[sizeof(Undefined)];
alignas(SharedSymbol) char d[sizeof(SharedSymbol)];
alignas(LazyArchive) char e[sizeof(LazyArchive)];
alignas(LazyObject) char f[sizeof(LazyObject)];
};
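// Illustrative sketch (not part of lld's real helpers) of the placement-new
// pattern described above; the constructor arguments are placeholders.
//
//   SymbolUnion storage;
//   Symbol *sym = new (&storage)
//       Undefined(file, name, llvm::ELF::STB_GLOBAL, llvm::ELF::STV_DEFAULT,
//                 llvm::ELF::STT_NOTYPE);
//   // Name resolution can later overwrite the same storage via
//   // Symbol::replace() without any separate allocation.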
// It is important to keep the size of SymbolUnion small for performance and
// memory usage reasons. 80 bytes is a soft limit based on the size of Defined
// on a 64-bit system.
static_assert(sizeof(SymbolUnion) <= 80, "SymbolUnion too large");
template <typename T> struct AssertSymbol {
static_assert(std::is_trivially_destructible<T>(),
"Symbol types must be trivially destructible");
static_assert(sizeof(T) <= sizeof(SymbolUnion), "SymbolUnion too small");
static_assert(alignof(T) <= alignof(SymbolUnion),
"SymbolUnion not aligned enough");
};
static inline void assertSymbols() {
AssertSymbol<Defined>();
AssertSymbol<CommonSymbol>();
AssertSymbol<Undefined>();
AssertSymbol<SharedSymbol>();
AssertSymbol<LazyArchive>();
AssertSymbol<LazyObject>();
}
void printTraceSymbol(const Symbol *sym);
size_t Symbol::getSymbolSize() const {
switch (kind()) {
case CommonKind:
return sizeof(CommonSymbol);
case DefinedKind:
return sizeof(Defined);
case LazyArchiveKind:
return sizeof(LazyArchive);
case LazyObjectKind:
return sizeof(LazyObject);
case SharedKind:
return sizeof(SharedSymbol);
case UndefinedKind:
return sizeof(Undefined);
case PlaceholderKind:
return sizeof(Symbol);
}
llvm_unreachable("unknown symbol kind");
}
// replace() replaces "this" object with a given symbol by memcpy'ing
// it over to "this". This function is called as a result of name
// resolution, e.g. to replace an undefined symbol with a defined symbol.
void Symbol::replace(const Symbol &New) {
using llvm::ELF::STT_TLS;
// Symbols representing thread-local variables must be referenced by
// TLS-aware relocations, and non-TLS symbols must be referenced by
// non-TLS relocations, so there's a clear distinction between TLS
// and non-TLS symbols. It is an error if the same symbol is defined
// as a TLS symbol in one file and as a non-TLS symbol in another file.
if (symbolKind != PlaceholderKind && !isLazy() && !New.isLazy()) {
bool tlsMismatch = (type == STT_TLS && New.type != STT_TLS) ||
(type != STT_TLS && New.type == STT_TLS);
if (tlsMismatch)
error("TLS attribute mismatch: " + toString(*this) + "\n>>> defined in " +
toString(New.file) + "\n>>> defined in " + toString(file));
}
Symbol old = *this;
memcpy(this, &New, New.getSymbolSize());
versionId = old.versionId;
visibility = old.visibility;
isUsedInRegularObj = old.isUsedInRegularObj;
exportDynamic = old.exportDynamic;
canInline = old.canInline;
traced = old.traced;
isPreemptible = old.isPreemptible;
scriptDefined = old.scriptDefined;
partition = old.partition;
// The symbol name length is computed lazily. If we already know the name
// length, propagate it.
if (nameData == old.nameData && nameSize == 0 && old.nameSize != 0)
nameSize = old.nameSize;
// Print out a log message if --trace-symbol was specified.
// This is for debugging.
if (traced)
printTraceSymbol(this);
}
void maybeWarnUnorderableSymbol(const Symbol *sym);
} // namespace elf
} // namespace lld
#endif
Index: projects/clang900-import/contrib/llvm/tools/lld/docs/ReleaseNotes.rst
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld/docs/ReleaseNotes.rst (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld/docs/ReleaseNotes.rst (revision 351722)
@@ -1,101 +1,88 @@
=======================
lld 9.0.0 Release Notes
=======================
.. contents::
:local:
-.. warning::
- These are in-progress notes for the upcoming LLVM 9.0.0 release.
- Release notes for previous releases can be found on
- `the Download Page <https://releases.llvm.org/download.html>`_.
-
Introduction
============
This document contains the release notes for the lld linker, release 9.0.0.
Here we describe the status of lld, including major improvements
from the previous release. All lld releases may be downloaded
from the `LLVM releases web site <https://llvm.org/releases/>`_.
Non-comprehensive list of changes in this release
=================================================
ELF Improvements
----------------
* ld.lld now has typo suggestions for flags:
``$ ld.lld --call-shared`` now prints
``unknown argument '--call-shared', did you mean '--call_shared'``.
* lld now supports replacing ``JAL`` with ``JALX`` instructions in case
of MIPS - microMIPS cross-mode jumps.
* lld now creates LA25 thunks for MIPS R6 code.
* Put MIPS-specific .reginfo, .MIPS.options, and .MIPS.abiflags sections
into corresponding PT_MIPS_REGINFO, PT_MIPS_OPTIONS, and PT_MIPS_ABIFLAGS
segments.
-* ...
-
COFF Improvements
-----------------
* Like the ELF driver, lld-link now has typo suggestions for flags.
* lld-link now correctly reports duplicate symbol errors for obj files
that were compiled with /Gy.
* lld-link now correctly reports duplicate symbol errors when several res
input files define resources with the same type, name, and language.
This can be demoted to a warning using ``/force:multipleres``.
+* lld-link now rejects more than one resource obj input file, matching
+ link.exe. Previously, lld-link would silently ignore all but one.
+ If you hit this: don't pass resource obj files to the linker; instead pass
+ res files to the linker directly. Don't put res files in static libraries;
+ pass them on the command line.
+
* Having more than two ``/natvis:`` now works correctly; previously this
did not work for larger binaries.
* Undefined symbols are now printed only in demangled form. Pass
``/demangle:no`` to see raw symbol names instead.
* The following flags have been added: ``/functionpadmin``, ``/swaprun:``,
``/threads:no``
* Several speed and memory usage improvements.
* Range extension thunks are now created for ARM64, if needed
* lld-link now supports resource object files created by GNU windres and
MS cvtres, not only llvm-cvtres
* The generated thunks for delayimports now share the majority of code
among thunks, significantly reducing the overhead of using delayimport
-* ...
-
MinGW Improvements
------------------
* lld now correctly links crtend.o as the last object file, handling
terminators for the sections such as .eh_frame properly, fixing
DWARF exception handling with libgcc and gcc's crtend.o.
* lld now also handles DWARF unwind info generated by GCC, when linking
with libgcc
* Many more GNU ld options are now supported, which e.g. allows the lld
MinGW frontend to be called by GCC
* PDB output can be requested without manually specifying the PDB file
name, by passing the new option ``-pdb=`` with an empty value.
(The existing syntax ``-pdb <filename>`` was more cumbersome to use
with an empty parameter value.)
-
-MachO Improvements
-------------------
-
-* Item 1.
-
-WebAssembly Improvements
-------------------------
-
-* ...
Index: projects/clang900-import/contrib/llvm/tools/lld
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lld (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lld (revision 351722)
Property changes on: projects/clang900-import/contrib/llvm/tools/lld
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/lld/dist-release_90:r351684-351721
Index: projects/clang900-import/contrib/llvm/tools/lldb
===================================================================
--- projects/clang900-import/contrib/llvm/tools/lldb (revision 351721)
+++ projects/clang900-import/contrib/llvm/tools/lldb (revision 351722)
Property changes on: projects/clang900-import/contrib/llvm/tools/lldb
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/lldb/dist-release_90:r351684-351721
Index: projects/clang900-import/contrib/llvm
===================================================================
--- projects/clang900-import/contrib/llvm (revision 351721)
+++ projects/clang900-import/contrib/llvm (revision 351722)
Property changes on: projects/clang900-import/contrib/llvm
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm/dist-release_90:r351684-351721
Index: projects/clang900-import/contrib/openmp
===================================================================
--- projects/clang900-import/contrib/openmp (revision 351721)
+++ projects/clang900-import/contrib/openmp (revision 351722)
Property changes on: projects/clang900-import/contrib/openmp
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm-openmp/dist-release_90:r351684-351721
Index: projects/clang900-import/lib/clang/include/lld/Common/Version.inc
===================================================================
--- projects/clang900-import/lib/clang/include/lld/Common/Version.inc (revision 351721)
+++ projects/clang900-import/lib/clang/include/lld/Common/Version.inc (revision 351722)
@@ -1,10 +1,10 @@
// $FreeBSD$
#define LLD_VERSION 9.0.0
#define LLD_VERSION_STRING "9.0.0"
#define LLD_VERSION_MAJOR 9
#define LLD_VERSION_MINOR 0
// <Upstream revision at import>-<Local identifier in __FreeBSD_version style>
-#define LLD_REVISION "369369-1300004"
+#define LLD_REVISION "370514-1300004"
#define LLD_REPOSITORY "FreeBSD"
Index: projects/clang900-import/lib/clang/include/llvm/Support/VCSRevision.h
===================================================================
--- projects/clang900-import/lib/clang/include/llvm/Support/VCSRevision.h (revision 351721)
+++ projects/clang900-import/lib/clang/include/llvm/Support/VCSRevision.h (revision 351722)
@@ -1,3 +1,3 @@
/* $FreeBSD$ */
-#define LLVM_REVISION "369369"
+#define LLVM_REVISION "370514"
#define LLVM_REPOSITORY "https://llvm.org/svn/llvm-project/llvm/branches/release_90"
